{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 15996, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 6.251562890722681e-05, "grad_norm": 26.324966430664062, "learning_rate": 0.0, "loss": 2.5633, "memory/device_reserved (GiB)": 33.76, "memory/max_active (GiB)": 33.61, "memory/max_allocated (GiB)": 33.61, "step": 1, "tokens_per_second_per_gpu": 7217.55, "total_tokens": 96081 }, { "epoch": 0.00012503125781445363, "grad_norm": 26.468585968017578, "learning_rate": 1.2507817385866167e-08, "loss": 2.5629, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2, "tokens_per_second_per_gpu": 17846.48, "total_tokens": 198759 }, { "epoch": 0.00018754688672168043, "grad_norm": 24.313596725463867, "learning_rate": 2.5015634771732333e-08, "loss": 2.4421, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3, "tokens_per_second_per_gpu": 18576.26, "total_tokens": 299196 }, { "epoch": 0.00025006251562890725, "grad_norm": 25.956329345703125, "learning_rate": 3.75234521575985e-08, "loss": 2.5423, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4, "tokens_per_second_per_gpu": 18635.65, "total_tokens": 401928 }, { "epoch": 0.000312578144536134, "grad_norm": 24.033823013305664, "learning_rate": 5.0031269543464667e-08, "loss": 2.5205, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5, "tokens_per_second_per_gpu": 17606.6, "total_tokens": 503718 }, { "epoch": 0.00037509377344336085, "grad_norm": 20.873069763183594, "learning_rate": 6.253908692933084e-08, "loss": 2.3644, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6, "tokens_per_second_per_gpu": 17150.5, "total_tokens": 599898 }, { "epoch": 0.0004376094023505876, "grad_norm": 25.277870178222656, "learning_rate": 7.5046904315197e-08, "loss": 2.3593, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7, "tokens_per_second_per_gpu": 18480.4, "total_tokens": 702657 }, { "epoch": 0.0005001250312578145, "grad_norm": 28.39929962158203, "learning_rate": 8.755472170106318e-08, "loss": 2.56, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8, "tokens_per_second_per_gpu": 16996.91, "total_tokens": 795428 }, { "epoch": 0.0005626406601650412, "grad_norm": 20.36292839050293, "learning_rate": 1.0006253908692933e-07, "loss": 2.1874, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9, "tokens_per_second_per_gpu": 18482.51, "total_tokens": 896527 }, { "epoch": 0.000625156289072268, "grad_norm": 24.33814811706543, "learning_rate": 1.125703564727955e-07, "loss": 2.3283, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10, "tokens_per_second_per_gpu": 18304.17, "total_tokens": 998733 }, { "epoch": 0.0006876719179794949, "grad_norm": 25.24383544921875, "learning_rate": 1.2507817385866167e-07, "loss": 2.4927, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11, "tokens_per_second_per_gpu": 17035.61, "total_tokens": 1097747 }, { "epoch": 0.0007501875468867217, "grad_norm": 23.981609344482422, "learning_rate": 1.3758599124452784e-07, "loss": 2.3075, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12, "tokens_per_second_per_gpu": 17210.82, "total_tokens": 1195426 }, { "epoch": 0.0008127031757939485, "grad_norm": 28.674047470092773, "learning_rate": 1.50093808630394e-07, "loss": 2.5794, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13, "tokens_per_second_per_gpu": 17838.77, "total_tokens": 1294563 }, { "epoch": 0.0008752188047011753, "grad_norm": 22.898983001708984, "learning_rate": 1.6260162601626018e-07, "loss": 2.1697, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14, "tokens_per_second_per_gpu": 17426.33, "total_tokens": 1392412 }, { "epoch": 0.0009377344336084021, "grad_norm": 21.75069808959961, "learning_rate": 1.7510944340212635e-07, "loss": 2.1809, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15, "tokens_per_second_per_gpu": 17468.99, "total_tokens": 1491455 }, { "epoch": 0.001000250062515629, "grad_norm": 24.514413833618164, "learning_rate": 1.8761726078799252e-07, "loss": 2.3394, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 16, "tokens_per_second_per_gpu": 15393.15, "total_tokens": 1583946 }, { "epoch": 0.0010627656914228556, "grad_norm": 26.91559600830078, "learning_rate": 2.0012507817385867e-07, "loss": 2.5486, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 17, "tokens_per_second_per_gpu": 15764.99, "total_tokens": 1677498 }, { "epoch": 0.0011252813203300824, "grad_norm": 24.165000915527344, "learning_rate": 2.1263289555972486e-07, "loss": 2.5293, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 18, "tokens_per_second_per_gpu": 16485.84, "total_tokens": 1776368 }, { "epoch": 0.0011877969492373093, "grad_norm": 24.06385612487793, "learning_rate": 2.25140712945591e-07, "loss": 2.3125, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 19, "tokens_per_second_per_gpu": 17216.32, "total_tokens": 1873793 }, { "epoch": 0.001250312578144536, "grad_norm": 26.21052360534668, "learning_rate": 2.376485303314572e-07, "loss": 2.4583, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 20, "tokens_per_second_per_gpu": 18342.21, "total_tokens": 1973251 }, { "epoch": 0.001312828207051763, "grad_norm": 29.871559143066406, "learning_rate": 2.5015634771732335e-07, "loss": 2.6128, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 21, "tokens_per_second_per_gpu": 17551.81, "total_tokens": 2070926 }, { "epoch": 0.0013753438359589898, "grad_norm": 22.67438507080078, "learning_rate": 2.6266416510318954e-07, "loss": 2.2641, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 22, "tokens_per_second_per_gpu": 17533.08, "total_tokens": 2173880 }, { "epoch": 0.0014378594648662166, "grad_norm": 23.700355529785156, "learning_rate": 2.751719824890557e-07, "loss": 2.4281, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 23, "tokens_per_second_per_gpu": 18489.38, "total_tokens": 2277742 }, { "epoch": 0.0015003750937734434, "grad_norm": 26.5550537109375, "learning_rate": 2.876797998749219e-07, "loss": 2.4841, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 24, "tokens_per_second_per_gpu": 17959.6, "total_tokens": 2375444 }, { "epoch": 0.0015628907226806702, "grad_norm": 26.096166610717773, "learning_rate": 3.00187617260788e-07, "loss": 2.3747, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 25, "tokens_per_second_per_gpu": 17206.47, "total_tokens": 2470991 }, { "epoch": 0.001625406351587897, "grad_norm": 26.881195068359375, "learning_rate": 3.1269543464665417e-07, "loss": 2.4891, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 26, "tokens_per_second_per_gpu": 17919.69, "total_tokens": 2567012 }, { "epoch": 0.0016879219804951237, "grad_norm": 26.044544219970703, "learning_rate": 3.2520325203252037e-07, "loss": 2.5574, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 27, "tokens_per_second_per_gpu": 16989.81, "total_tokens": 2660023 }, { "epoch": 0.0017504376094023505, "grad_norm": 24.876901626586914, "learning_rate": 3.3771106941838656e-07, "loss": 2.43, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 28, "tokens_per_second_per_gpu": 17103.74, "total_tokens": 2756239 }, { "epoch": 0.0018129532383095773, "grad_norm": 27.036136627197266, "learning_rate": 3.502188868042527e-07, "loss": 2.4381, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 29, "tokens_per_second_per_gpu": 16428.7, "total_tokens": 2853616 }, { "epoch": 0.0018754688672168042, "grad_norm": 23.99689483642578, "learning_rate": 3.6272670419011885e-07, "loss": 2.4653, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 30, "tokens_per_second_per_gpu": 17861.01, "total_tokens": 2953573 }, { "epoch": 0.001937984496124031, "grad_norm": 24.685932159423828, "learning_rate": 3.7523452157598505e-07, "loss": 2.4116, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 31, "tokens_per_second_per_gpu": 17865.51, "total_tokens": 3052344 }, { "epoch": 0.002000500125031258, "grad_norm": 22.14589500427246, "learning_rate": 3.877423389618512e-07, "loss": 2.2399, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 32, "tokens_per_second_per_gpu": 17435.08, "total_tokens": 3151989 }, { "epoch": 0.0020630157539384846, "grad_norm": 25.86381721496582, "learning_rate": 4.0025015634771733e-07, "loss": 2.5572, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 33, "tokens_per_second_per_gpu": 17163.3, "total_tokens": 3249216 }, { "epoch": 0.0021255313828457112, "grad_norm": 21.62105941772461, "learning_rate": 4.1275797373358353e-07, "loss": 2.1944, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 34, "tokens_per_second_per_gpu": 17360.79, "total_tokens": 3346576 }, { "epoch": 0.0021880470117529383, "grad_norm": 24.275415420532227, "learning_rate": 4.252657911194497e-07, "loss": 2.3687, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 35, "tokens_per_second_per_gpu": 17931.67, "total_tokens": 3447202 }, { "epoch": 0.002250562640660165, "grad_norm": 24.27597999572754, "learning_rate": 4.3777360850531587e-07, "loss": 2.3014, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 36, "tokens_per_second_per_gpu": 17693.03, "total_tokens": 3548100 }, { "epoch": 0.002313078269567392, "grad_norm": 25.21314811706543, "learning_rate": 4.50281425891182e-07, "loss": 2.3834, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 37, "tokens_per_second_per_gpu": 17433.8, "total_tokens": 3647735 }, { "epoch": 0.0023755938984746186, "grad_norm": 21.663822174072266, "learning_rate": 4.6278924327704816e-07, "loss": 2.25, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 38, "tokens_per_second_per_gpu": 18752.36, "total_tokens": 3752677 }, { "epoch": 0.0024381095273818456, "grad_norm": 23.777385711669922, "learning_rate": 4.752970606629144e-07, "loss": 2.404, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 39, "tokens_per_second_per_gpu": 17035.81, "total_tokens": 3851067 }, { "epoch": 0.002500625156289072, "grad_norm": 22.576210021972656, "learning_rate": 4.878048780487805e-07, "loss": 2.3567, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 40, "tokens_per_second_per_gpu": 17529.18, "total_tokens": 3948755 }, { "epoch": 0.0025631407851962992, "grad_norm": 22.640459060668945, "learning_rate": 5.003126954346467e-07, "loss": 2.1831, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 41, "tokens_per_second_per_gpu": 18247.38, "total_tokens": 4052481 }, { "epoch": 0.002625656414103526, "grad_norm": 25.199819564819336, "learning_rate": 5.128205128205128e-07, "loss": 2.413, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 42, "tokens_per_second_per_gpu": 17426.92, "total_tokens": 4151226 }, { "epoch": 0.002688172043010753, "grad_norm": 27.899221420288086, "learning_rate": 5.253283302063791e-07, "loss": 2.5678, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 43, "tokens_per_second_per_gpu": 16652.95, "total_tokens": 4245798 }, { "epoch": 0.0027506876719179795, "grad_norm": 26.12424087524414, "learning_rate": 5.378361475922452e-07, "loss": 2.4297, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 44, "tokens_per_second_per_gpu": 16694.67, "total_tokens": 4341049 }, { "epoch": 0.002813203300825206, "grad_norm": 25.187776565551758, "learning_rate": 5.503439649781114e-07, "loss": 2.3581, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 45, "tokens_per_second_per_gpu": 15654.94, "total_tokens": 4436164 }, { "epoch": 0.002875718929732433, "grad_norm": 26.27300262451172, "learning_rate": 5.628517823639775e-07, "loss": 2.5827, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 46, "tokens_per_second_per_gpu": 17577.5, "total_tokens": 4533724 }, { "epoch": 0.0029382345586396598, "grad_norm": 20.931150436401367, "learning_rate": 5.753595997498438e-07, "loss": 2.2434, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 47, "tokens_per_second_per_gpu": 18162.81, "total_tokens": 4633886 }, { "epoch": 0.003000750187546887, "grad_norm": 24.54862403869629, "learning_rate": 5.878674171357099e-07, "loss": 2.4796, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 48, "tokens_per_second_per_gpu": 18083.4, "total_tokens": 4734430 }, { "epoch": 0.0030632658164541134, "grad_norm": 22.80314826965332, "learning_rate": 6.00375234521576e-07, "loss": 2.3434, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 49, "tokens_per_second_per_gpu": 16875.38, "total_tokens": 4831605 }, { "epoch": 0.0031257814453613405, "grad_norm": 23.59935760498047, "learning_rate": 6.128830519074422e-07, "loss": 2.437, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 50, "tokens_per_second_per_gpu": 17260.33, "total_tokens": 4928754 }, { "epoch": 0.003188297074268567, "grad_norm": 23.454952239990234, "learning_rate": 6.253908692933083e-07, "loss": 2.326, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 51, "tokens_per_second_per_gpu": 16180.84, "total_tokens": 5025918 }, { "epoch": 0.003250812703175794, "grad_norm": 23.36529541015625, "learning_rate": 6.378986866791745e-07, "loss": 2.3244, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 52, "tokens_per_second_per_gpu": 18681.42, "total_tokens": 5128239 }, { "epoch": 0.0033133283320830207, "grad_norm": 27.74395179748535, "learning_rate": 6.504065040650407e-07, "loss": 2.5395, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 53, "tokens_per_second_per_gpu": 16258.35, "total_tokens": 5223446 }, { "epoch": 0.0033758439609902473, "grad_norm": 26.08795738220215, "learning_rate": 6.629143214509069e-07, "loss": 2.5977, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 54, "tokens_per_second_per_gpu": 16707.37, "total_tokens": 5317552 }, { "epoch": 0.0034383595898974744, "grad_norm": 24.23707389831543, "learning_rate": 6.754221388367731e-07, "loss": 2.3889, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 55, "tokens_per_second_per_gpu": 17616.13, "total_tokens": 5415958 }, { "epoch": 0.003500875218804701, "grad_norm": 24.12247657775879, "learning_rate": 6.879299562226393e-07, "loss": 2.244, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 56, "tokens_per_second_per_gpu": 17049.48, "total_tokens": 5515262 }, { "epoch": 0.003563390847711928, "grad_norm": 27.84832000732422, "learning_rate": 7.004377736085054e-07, "loss": 2.5334, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 57, "tokens_per_second_per_gpu": 16544.84, "total_tokens": 5610192 }, { "epoch": 0.0036259064766191547, "grad_norm": 23.407756805419922, "learning_rate": 7.129455909943716e-07, "loss": 2.3384, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 58, "tokens_per_second_per_gpu": 17836.38, "total_tokens": 5711624 }, { "epoch": 0.0036884221055263817, "grad_norm": 21.272531509399414, "learning_rate": 7.254534083802377e-07, "loss": 2.1467, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 59, "tokens_per_second_per_gpu": 18478.02, "total_tokens": 5813767 }, { "epoch": 0.0037509377344336083, "grad_norm": 25.48737144470215, "learning_rate": 7.379612257661038e-07, "loss": 2.4819, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 60, "tokens_per_second_per_gpu": 18920.02, "total_tokens": 5918353 }, { "epoch": 0.0038134533633408354, "grad_norm": 23.47774314880371, "learning_rate": 7.504690431519701e-07, "loss": 2.3082, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 61, "tokens_per_second_per_gpu": 18034.0, "total_tokens": 6018375 }, { "epoch": 0.003875968992248062, "grad_norm": 24.352188110351562, "learning_rate": 7.629768605378362e-07, "loss": 2.4588, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 62, "tokens_per_second_per_gpu": 17228.32, "total_tokens": 6117556 }, { "epoch": 0.003938484621155289, "grad_norm": 21.751779556274414, "learning_rate": 7.754846779237024e-07, "loss": 2.2422, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 63, "tokens_per_second_per_gpu": 16589.15, "total_tokens": 6212337 }, { "epoch": 0.004001000250062516, "grad_norm": 18.974044799804688, "learning_rate": 7.879924953095685e-07, "loss": 2.1573, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 64, "tokens_per_second_per_gpu": 17158.04, "total_tokens": 6313131 }, { "epoch": 0.004063515878969743, "grad_norm": 21.18741798400879, "learning_rate": 8.005003126954347e-07, "loss": 2.2738, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 65, "tokens_per_second_per_gpu": 18587.22, "total_tokens": 6412238 }, { "epoch": 0.004126031507876969, "grad_norm": 20.35104751586914, "learning_rate": 8.130081300813009e-07, "loss": 2.1407, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 66, "tokens_per_second_per_gpu": 17633.97, "total_tokens": 6514187 }, { "epoch": 0.004188547136784196, "grad_norm": 19.78925895690918, "learning_rate": 8.255159474671671e-07, "loss": 2.0944, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 67, "tokens_per_second_per_gpu": 18357.62, "total_tokens": 6614161 }, { "epoch": 0.0042510627656914225, "grad_norm": 23.424367904663086, "learning_rate": 8.380237648530332e-07, "loss": 2.4187, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 68, "tokens_per_second_per_gpu": 17800.59, "total_tokens": 6715610 }, { "epoch": 0.00431357839459865, "grad_norm": 25.81906509399414, "learning_rate": 8.505315822388995e-07, "loss": 2.4995, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 69, "tokens_per_second_per_gpu": 17043.89, "total_tokens": 6807793 }, { "epoch": 0.004376094023505877, "grad_norm": 23.12018585205078, "learning_rate": 8.630393996247656e-07, "loss": 2.2795, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 70, "tokens_per_second_per_gpu": 17504.91, "total_tokens": 6908864 }, { "epoch": 0.004438609652413103, "grad_norm": 20.198945999145508, "learning_rate": 8.755472170106317e-07, "loss": 2.1267, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 71, "tokens_per_second_per_gpu": 17699.54, "total_tokens": 7005867 }, { "epoch": 0.00450112528132033, "grad_norm": 24.044153213500977, "learning_rate": 8.880550343964979e-07, "loss": 2.3849, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 72, "tokens_per_second_per_gpu": 16242.11, "total_tokens": 7096390 }, { "epoch": 0.004563640910227557, "grad_norm": 22.98520278930664, "learning_rate": 9.00562851782364e-07, "loss": 2.3139, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 73, "tokens_per_second_per_gpu": 18083.18, "total_tokens": 7195159 }, { "epoch": 0.004626156539134784, "grad_norm": 18.33698272705078, "learning_rate": 9.130706691682302e-07, "loss": 2.107, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 74, "tokens_per_second_per_gpu": 17500.05, "total_tokens": 7300781 }, { "epoch": 0.0046886721680420105, "grad_norm": 24.936655044555664, "learning_rate": 9.255784865540963e-07, "loss": 2.4322, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 75, "tokens_per_second_per_gpu": 17471.65, "total_tokens": 7397078 }, { "epoch": 0.004751187796949237, "grad_norm": 23.486305236816406, "learning_rate": 9.380863039399625e-07, "loss": 2.4011, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 76, "tokens_per_second_per_gpu": 15762.39, "total_tokens": 7491289 }, { "epoch": 0.004813703425856464, "grad_norm": 19.616729736328125, "learning_rate": 9.505941213258288e-07, "loss": 2.2025, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 77, "tokens_per_second_per_gpu": 17559.9, "total_tokens": 7591621 }, { "epoch": 0.004876219054763691, "grad_norm": 18.154634475708008, "learning_rate": 9.631019387116948e-07, "loss": 2.0703, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 78, "tokens_per_second_per_gpu": 18110.5, "total_tokens": 7691729 }, { "epoch": 0.004938734683670918, "grad_norm": 20.110092163085938, "learning_rate": 9.75609756097561e-07, "loss": 2.2061, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 79, "tokens_per_second_per_gpu": 18222.27, "total_tokens": 7794213 }, { "epoch": 0.005001250312578144, "grad_norm": 21.601760864257812, "learning_rate": 9.881175734834271e-07, "loss": 2.4281, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 80, "tokens_per_second_per_gpu": 18591.23, "total_tokens": 7895994 }, { "epoch": 0.005063765941485371, "grad_norm": 21.563676834106445, "learning_rate": 1.0006253908692934e-06, "loss": 2.3633, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 81, "tokens_per_second_per_gpu": 16964.09, "total_tokens": 7994495 }, { "epoch": 0.0051262815703925985, "grad_norm": 19.754430770874023, "learning_rate": 1.0131332082551596e-06, "loss": 2.1136, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 82, "tokens_per_second_per_gpu": 17716.81, "total_tokens": 8093172 }, { "epoch": 0.005188797199299825, "grad_norm": 22.72014617919922, "learning_rate": 1.0256410256410257e-06, "loss": 2.2502, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 83, "tokens_per_second_per_gpu": 16848.74, "total_tokens": 8190453 }, { "epoch": 0.005251312828207052, "grad_norm": 21.32271385192871, "learning_rate": 1.038148843026892e-06, "loss": 2.2072, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 84, "tokens_per_second_per_gpu": 16965.27, "total_tokens": 8289084 }, { "epoch": 0.005313828457114278, "grad_norm": 22.06643295288086, "learning_rate": 1.0506566604127582e-06, "loss": 2.26, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 85, "tokens_per_second_per_gpu": 17436.15, "total_tokens": 8386403 }, { "epoch": 0.005376344086021506, "grad_norm": 20.330291748046875, "learning_rate": 1.0631644777986242e-06, "loss": 2.1138, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 86, "tokens_per_second_per_gpu": 18506.16, "total_tokens": 8488017 }, { "epoch": 0.005438859714928732, "grad_norm": 22.931745529174805, "learning_rate": 1.0756722951844905e-06, "loss": 2.311, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 87, "tokens_per_second_per_gpu": 16766.09, "total_tokens": 8584580 }, { "epoch": 0.005501375343835959, "grad_norm": 22.232810974121094, "learning_rate": 1.0881801125703565e-06, "loss": 2.2612, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 88, "tokens_per_second_per_gpu": 16643.67, "total_tokens": 8684508 }, { "epoch": 0.005563890972743186, "grad_norm": 21.132366180419922, "learning_rate": 1.1006879299562227e-06, "loss": 2.1008, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 89, "tokens_per_second_per_gpu": 17296.87, "total_tokens": 8784041 }, { "epoch": 0.005626406601650412, "grad_norm": 21.728622436523438, "learning_rate": 1.1131957473420888e-06, "loss": 2.3404, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 90, "tokens_per_second_per_gpu": 18338.72, "total_tokens": 8884184 }, { "epoch": 0.00568892223055764, "grad_norm": 22.837305068969727, "learning_rate": 1.125703564727955e-06, "loss": 2.242, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 91, "tokens_per_second_per_gpu": 17655.33, "total_tokens": 8985410 }, { "epoch": 0.005751437859464866, "grad_norm": 22.790390014648438, "learning_rate": 1.1382113821138213e-06, "loss": 2.1204, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 92, "tokens_per_second_per_gpu": 17427.79, "total_tokens": 9086109 }, { "epoch": 0.005813953488372093, "grad_norm": 16.759748458862305, "learning_rate": 1.1507191994996875e-06, "loss": 2.0202, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 93, "tokens_per_second_per_gpu": 17779.6, "total_tokens": 9190188 }, { "epoch": 0.0058764691172793196, "grad_norm": 19.306867599487305, "learning_rate": 1.1632270168855536e-06, "loss": 2.0437, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 94, "tokens_per_second_per_gpu": 17284.42, "total_tokens": 9290237 }, { "epoch": 0.005938984746186547, "grad_norm": 18.88347625732422, "learning_rate": 1.1757348342714198e-06, "loss": 2.1387, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 95, "tokens_per_second_per_gpu": 17505.7, "total_tokens": 9391717 }, { "epoch": 0.006001500375093774, "grad_norm": 20.623004913330078, "learning_rate": 1.1882426516572859e-06, "loss": 2.1873, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 96, "tokens_per_second_per_gpu": 17071.2, "total_tokens": 9490401 }, { "epoch": 0.006064016004001, "grad_norm": 18.602373123168945, "learning_rate": 1.200750469043152e-06, "loss": 2.1875, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 97, "tokens_per_second_per_gpu": 17988.7, "total_tokens": 9592169 }, { "epoch": 0.006126531632908227, "grad_norm": 22.026641845703125, "learning_rate": 1.2132582864290181e-06, "loss": 2.1881, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 98, "tokens_per_second_per_gpu": 16485.81, "total_tokens": 9689075 }, { "epoch": 0.0061890472618154535, "grad_norm": 17.79117774963379, "learning_rate": 1.2257661038148844e-06, "loss": 2.0404, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 99, "tokens_per_second_per_gpu": 16852.04, "total_tokens": 9790467 }, { "epoch": 0.006251562890722681, "grad_norm": 21.099042892456055, "learning_rate": 1.2382739212007504e-06, "loss": 2.1946, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 100, "tokens_per_second_per_gpu": 17153.68, "total_tokens": 9889934 }, { "epoch": 0.0063140785196299076, "grad_norm": 17.772851943969727, "learning_rate": 1.2507817385866167e-06, "loss": 2.0153, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 101, "tokens_per_second_per_gpu": 17460.4, "total_tokens": 9989296 }, { "epoch": 0.006376594148537134, "grad_norm": 22.0889892578125, "learning_rate": 1.2632895559724827e-06, "loss": 2.115, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 102, "tokens_per_second_per_gpu": 17353.06, "total_tokens": 10088310 }, { "epoch": 0.006439109777444361, "grad_norm": 15.643810272216797, "learning_rate": 1.275797373358349e-06, "loss": 1.9145, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 103, "tokens_per_second_per_gpu": 18059.54, "total_tokens": 10188519 }, { "epoch": 0.006501625406351588, "grad_norm": 15.57465934753418, "learning_rate": 1.2883051907442152e-06, "loss": 1.9562, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 104, "tokens_per_second_per_gpu": 17302.75, "total_tokens": 10288054 }, { "epoch": 0.006564141035258815, "grad_norm": 18.141143798828125, "learning_rate": 1.3008130081300815e-06, "loss": 2.1002, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 105, "tokens_per_second_per_gpu": 17118.99, "total_tokens": 10384830 }, { "epoch": 0.0066266566641660415, "grad_norm": 17.949281692504883, "learning_rate": 1.3133208255159477e-06, "loss": 2.0643, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 106, "tokens_per_second_per_gpu": 17131.21, "total_tokens": 10484571 }, { "epoch": 0.006689172293073268, "grad_norm": 13.156450271606445, "learning_rate": 1.3258286429018137e-06, "loss": 1.872, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 107, "tokens_per_second_per_gpu": 18113.38, "total_tokens": 10589942 }, { "epoch": 0.006751687921980495, "grad_norm": 18.153148651123047, "learning_rate": 1.33833646028768e-06, "loss": 2.1496, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 108, "tokens_per_second_per_gpu": 17146.28, "total_tokens": 10685607 }, { "epoch": 0.006814203550887722, "grad_norm": 19.097515106201172, "learning_rate": 1.3508442776735462e-06, "loss": 2.0696, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 109, "tokens_per_second_per_gpu": 16981.79, "total_tokens": 10784279 }, { "epoch": 0.006876719179794949, "grad_norm": 15.56169605255127, "learning_rate": 1.3633520950594123e-06, "loss": 2.0141, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 110, "tokens_per_second_per_gpu": 17857.61, "total_tokens": 10888604 }, { "epoch": 0.006939234808702175, "grad_norm": 15.851075172424316, "learning_rate": 1.3758599124452785e-06, "loss": 1.9048, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 111, "tokens_per_second_per_gpu": 17968.06, "total_tokens": 10990397 }, { "epoch": 0.007001750437609402, "grad_norm": 19.134414672851562, "learning_rate": 1.3883677298311446e-06, "loss": 2.0396, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 112, "tokens_per_second_per_gpu": 17104.07, "total_tokens": 11085799 }, { "epoch": 0.0070642660665166295, "grad_norm": 15.184460639953613, "learning_rate": 1.4008755472170108e-06, "loss": 1.911, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 113, "tokens_per_second_per_gpu": 18119.34, "total_tokens": 11187909 }, { "epoch": 0.007126781695423856, "grad_norm": 17.306493759155273, "learning_rate": 1.4133833646028769e-06, "loss": 2.0032, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 114, "tokens_per_second_per_gpu": 16463.25, "total_tokens": 11287342 }, { "epoch": 0.007189297324331083, "grad_norm": 17.451339721679688, "learning_rate": 1.4258911819887431e-06, "loss": 2.0572, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 115, "tokens_per_second_per_gpu": 18346.92, "total_tokens": 11388650 }, { "epoch": 0.007251812953238309, "grad_norm": 18.153676986694336, "learning_rate": 1.4383989993746091e-06, "loss": 2.0478, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 116, "tokens_per_second_per_gpu": 16714.59, "total_tokens": 11485585 }, { "epoch": 0.007314328582145537, "grad_norm": 18.03830909729004, "learning_rate": 1.4509068167604754e-06, "loss": 1.8548, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 117, "tokens_per_second_per_gpu": 17729.14, "total_tokens": 11582798 }, { "epoch": 0.007376844211052763, "grad_norm": 16.39058494567871, "learning_rate": 1.4634146341463414e-06, "loss": 2.0164, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 118, "tokens_per_second_per_gpu": 17224.45, "total_tokens": 11682981 }, { "epoch": 0.00743935983995999, "grad_norm": 17.338871002197266, "learning_rate": 1.4759224515322077e-06, "loss": 1.926, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 119, "tokens_per_second_per_gpu": 16865.04, "total_tokens": 11781124 }, { "epoch": 0.007501875468867217, "grad_norm": 14.978799819946289, "learning_rate": 1.4884302689180737e-06, "loss": 1.9018, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 120, "tokens_per_second_per_gpu": 17454.21, "total_tokens": 11880498 }, { "epoch": 0.007564391097774443, "grad_norm": 15.609264373779297, "learning_rate": 1.5009380863039402e-06, "loss": 2.0214, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 121, "tokens_per_second_per_gpu": 17640.18, "total_tokens": 11981037 }, { "epoch": 0.007626906726681671, "grad_norm": 13.015109062194824, "learning_rate": 1.5134459036898064e-06, "loss": 1.8495, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 122, "tokens_per_second_per_gpu": 18483.24, "total_tokens": 12084719 }, { "epoch": 0.007689422355588897, "grad_norm": 16.41800308227539, "learning_rate": 1.5259537210756725e-06, "loss": 1.8985, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 123, "tokens_per_second_per_gpu": 16658.65, "total_tokens": 12179421 }, { "epoch": 0.007751937984496124, "grad_norm": 15.437102317810059, "learning_rate": 1.5384615384615387e-06, "loss": 1.982, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 124, "tokens_per_second_per_gpu": 17114.46, "total_tokens": 12274855 }, { "epoch": 0.007814453613403351, "grad_norm": 16.093629837036133, "learning_rate": 1.5509693558474048e-06, "loss": 1.9734, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 125, "tokens_per_second_per_gpu": 16629.94, "total_tokens": 12370746 }, { "epoch": 0.007876969242310577, "grad_norm": 15.326961517333984, "learning_rate": 1.563477173233271e-06, "loss": 1.9601, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 126, "tokens_per_second_per_gpu": 16536.1, "total_tokens": 12466422 }, { "epoch": 0.007939484871217805, "grad_norm": 13.927566528320312, "learning_rate": 1.575984990619137e-06, "loss": 1.9033, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 127, "tokens_per_second_per_gpu": 18364.48, "total_tokens": 12571093 }, { "epoch": 0.008002000500125032, "grad_norm": 11.645222663879395, "learning_rate": 1.5884928080050033e-06, "loss": 1.7169, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 128, "tokens_per_second_per_gpu": 16978.14, "total_tokens": 12668150 }, { "epoch": 0.008064516129032258, "grad_norm": 14.653776168823242, "learning_rate": 1.6010006253908693e-06, "loss": 1.9434, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 129, "tokens_per_second_per_gpu": 17433.17, "total_tokens": 12768027 }, { "epoch": 0.008127031757939485, "grad_norm": 18.511003494262695, "learning_rate": 1.6135084427767356e-06, "loss": 2.0108, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 130, "tokens_per_second_per_gpu": 17100.44, "total_tokens": 12866125 }, { "epoch": 0.008189547386846711, "grad_norm": 14.988323211669922, "learning_rate": 1.6260162601626018e-06, "loss": 1.9593, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 131, "tokens_per_second_per_gpu": 18429.25, "total_tokens": 12967132 }, { "epoch": 0.008252063015753939, "grad_norm": 16.131061553955078, "learning_rate": 1.6385240775484679e-06, "loss": 2.0054, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 132, "tokens_per_second_per_gpu": 16842.74, "total_tokens": 13068644 }, { "epoch": 0.008314578644661166, "grad_norm": 13.400984764099121, "learning_rate": 1.6510318949343341e-06, "loss": 1.8427, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 133, "tokens_per_second_per_gpu": 18674.33, "total_tokens": 13172801 }, { "epoch": 0.008377094273568392, "grad_norm": 11.853116035461426, "learning_rate": 1.6635397123202002e-06, "loss": 1.8012, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 134, "tokens_per_second_per_gpu": 17345.73, "total_tokens": 13275390 }, { "epoch": 0.00843960990247562, "grad_norm": 16.59583282470703, "learning_rate": 1.6760475297060664e-06, "loss": 1.8934, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 135, "tokens_per_second_per_gpu": 17485.05, "total_tokens": 13369291 }, { "epoch": 0.008502125531382845, "grad_norm": 13.592601776123047, "learning_rate": 1.6885553470919324e-06, "loss": 1.7935, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 136, "tokens_per_second_per_gpu": 15899.24, "total_tokens": 13467169 }, { "epoch": 0.008564641160290072, "grad_norm": 14.683588027954102, "learning_rate": 1.701063164477799e-06, "loss": 1.8087, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 137, "tokens_per_second_per_gpu": 17377.09, "total_tokens": 13564378 }, { "epoch": 0.0086271567891973, "grad_norm": 12.891793251037598, "learning_rate": 1.7135709818636651e-06, "loss": 1.8176, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 138, "tokens_per_second_per_gpu": 17437.67, "total_tokens": 13663279 }, { "epoch": 0.008689672418104526, "grad_norm": 14.808735847473145, "learning_rate": 1.7260787992495312e-06, "loss": 1.8499, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 139, "tokens_per_second_per_gpu": 17599.33, "total_tokens": 13760779 }, { "epoch": 0.008752188047011753, "grad_norm": 13.170564651489258, "learning_rate": 1.7385866166353974e-06, "loss": 1.8302, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 140, "tokens_per_second_per_gpu": 17012.69, "total_tokens": 13857897 }, { "epoch": 0.008814703675918979, "grad_norm": 11.875057220458984, "learning_rate": 1.7510944340212635e-06, "loss": 1.7649, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 141, "tokens_per_second_per_gpu": 17212.97, "total_tokens": 13959011 }, { "epoch": 0.008877219304826206, "grad_norm": 10.065364837646484, "learning_rate": 1.7636022514071297e-06, "loss": 1.6872, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 142, "tokens_per_second_per_gpu": 16826.44, "total_tokens": 14054773 }, { "epoch": 0.008939734933733434, "grad_norm": 10.185818672180176, "learning_rate": 1.7761100687929958e-06, "loss": 1.6418, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 143, "tokens_per_second_per_gpu": 17290.68, "total_tokens": 14151689 }, { "epoch": 0.00900225056264066, "grad_norm": 9.334845542907715, "learning_rate": 1.788617886178862e-06, "loss": 1.7063, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 144, "tokens_per_second_per_gpu": 17175.14, "total_tokens": 14248528 }, { "epoch": 0.009064766191547887, "grad_norm": 11.071475982666016, "learning_rate": 1.801125703564728e-06, "loss": 1.7246, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 145, "tokens_per_second_per_gpu": 17208.71, "total_tokens": 14346563 }, { "epoch": 0.009127281820455115, "grad_norm": 10.5105619430542, "learning_rate": 1.8136335209505943e-06, "loss": 1.7242, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 146, "tokens_per_second_per_gpu": 17628.02, "total_tokens": 14445485 }, { "epoch": 0.00918979744936234, "grad_norm": 9.024543762207031, "learning_rate": 1.8261413383364603e-06, "loss": 1.6719, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 147, "tokens_per_second_per_gpu": 17663.35, "total_tokens": 14547680 }, { "epoch": 0.009252313078269568, "grad_norm": 11.94596004486084, "learning_rate": 1.8386491557223266e-06, "loss": 1.7694, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 148, "tokens_per_second_per_gpu": 17118.0, "total_tokens": 14641784 }, { "epoch": 0.009314828707176794, "grad_norm": 12.338186264038086, "learning_rate": 1.8511569731081926e-06, "loss": 1.7237, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 149, "tokens_per_second_per_gpu": 16700.57, "total_tokens": 14737126 }, { "epoch": 0.009377344336084021, "grad_norm": 9.237903594970703, "learning_rate": 1.8636647904940589e-06, "loss": 1.7051, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 150, "tokens_per_second_per_gpu": 16941.56, "total_tokens": 14833084 }, { "epoch": 0.009439859964991248, "grad_norm": 8.428494453430176, "learning_rate": 1.876172607879925e-06, "loss": 1.6568, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 151, "tokens_per_second_per_gpu": 16724.84, "total_tokens": 14935830 }, { "epoch": 0.009502375593898474, "grad_norm": 8.985818862915039, "learning_rate": 1.8886804252657912e-06, "loss": 1.7042, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 152, "tokens_per_second_per_gpu": 18640.45, "total_tokens": 15038543 }, { "epoch": 0.009564891222805702, "grad_norm": 9.939017295837402, "learning_rate": 1.9011882426516576e-06, "loss": 1.7207, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 153, "tokens_per_second_per_gpu": 16603.93, "total_tokens": 15131276 }, { "epoch": 0.009627406851712927, "grad_norm": 9.700238227844238, "learning_rate": 1.9136960600375237e-06, "loss": 1.7404, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 154, "tokens_per_second_per_gpu": 16222.78, "total_tokens": 15221662 }, { "epoch": 0.009689922480620155, "grad_norm": 8.096330642700195, "learning_rate": 1.9262038774233897e-06, "loss": 1.6352, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 155, "tokens_per_second_per_gpu": 17761.51, "total_tokens": 15320952 }, { "epoch": 0.009752438109527382, "grad_norm": 7.867141246795654, "learning_rate": 1.938711694809256e-06, "loss": 1.5827, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 156, "tokens_per_second_per_gpu": 17065.04, "total_tokens": 15418197 }, { "epoch": 0.009814953738434608, "grad_norm": 7.580371856689453, "learning_rate": 1.951219512195122e-06, "loss": 1.6203, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 157, "tokens_per_second_per_gpu": 16329.9, "total_tokens": 15517373 }, { "epoch": 0.009877469367341836, "grad_norm": 7.987936496734619, "learning_rate": 1.9637273295809882e-06, "loss": 1.6563, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 158, "tokens_per_second_per_gpu": 16359.65, "total_tokens": 15614076 }, { "epoch": 0.009939984996249063, "grad_norm": 5.578784942626953, "learning_rate": 1.9762351469668543e-06, "loss": 1.5231, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 159, "tokens_per_second_per_gpu": 17765.93, "total_tokens": 15714086 }, { "epoch": 0.010002500625156289, "grad_norm": 5.735484600067139, "learning_rate": 1.9887429643527207e-06, "loss": 1.5466, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 160, "tokens_per_second_per_gpu": 18164.83, "total_tokens": 15814402 }, { "epoch": 0.010065016254063516, "grad_norm": 5.818664073944092, "learning_rate": 2.0012507817385868e-06, "loss": 1.5213, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 161, "tokens_per_second_per_gpu": 16726.01, "total_tokens": 15912117 }, { "epoch": 0.010127531882970742, "grad_norm": 6.772164344787598, "learning_rate": 2.013758599124453e-06, "loss": 1.6166, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 162, "tokens_per_second_per_gpu": 17552.33, "total_tokens": 16010770 }, { "epoch": 0.01019004751187797, "grad_norm": 6.757411479949951, "learning_rate": 2.0262664165103193e-06, "loss": 1.5589, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 163, "tokens_per_second_per_gpu": 16558.34, "total_tokens": 16106862 }, { "epoch": 0.010252563140785197, "grad_norm": 5.303125381469727, "learning_rate": 2.0387742338961853e-06, "loss": 1.5258, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 164, "tokens_per_second_per_gpu": 17826.4, "total_tokens": 16206169 }, { "epoch": 0.010315078769692423, "grad_norm": 6.100436210632324, "learning_rate": 2.0512820512820513e-06, "loss": 1.5649, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 165, "tokens_per_second_per_gpu": 17359.65, "total_tokens": 16304665 }, { "epoch": 0.01037759439859965, "grad_norm": 4.756083965301514, "learning_rate": 2.0637898686679174e-06, "loss": 1.4911, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 166, "tokens_per_second_per_gpu": 17719.83, "total_tokens": 16406648 }, { "epoch": 0.010440110027506876, "grad_norm": 5.764566421508789, "learning_rate": 2.076297686053784e-06, "loss": 1.5822, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 167, "tokens_per_second_per_gpu": 17031.31, "total_tokens": 16503679 }, { "epoch": 0.010502625656414103, "grad_norm": 5.581164836883545, "learning_rate": 2.08880550343965e-06, "loss": 1.5295, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 168, "tokens_per_second_per_gpu": 16724.0, "total_tokens": 16603303 }, { "epoch": 0.010565141285321331, "grad_norm": 4.830110549926758, "learning_rate": 2.1013133208255163e-06, "loss": 1.487, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 169, "tokens_per_second_per_gpu": 16293.18, "total_tokens": 16697818 }, { "epoch": 0.010627656914228557, "grad_norm": 4.93554162979126, "learning_rate": 2.1138211382113824e-06, "loss": 1.4818, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 170, "tokens_per_second_per_gpu": 17849.65, "total_tokens": 16799794 }, { "epoch": 0.010690172543135784, "grad_norm": 4.798553943634033, "learning_rate": 2.1263289555972484e-06, "loss": 1.5018, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 171, "tokens_per_second_per_gpu": 17126.8, "total_tokens": 16897885 }, { "epoch": 0.010752688172043012, "grad_norm": 4.8656768798828125, "learning_rate": 2.138836772983115e-06, "loss": 1.4612, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 172, "tokens_per_second_per_gpu": 16786.99, "total_tokens": 16994174 }, { "epoch": 0.010815203800950237, "grad_norm": 4.683867454528809, "learning_rate": 2.151344590368981e-06, "loss": 1.5182, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 173, "tokens_per_second_per_gpu": 18192.22, "total_tokens": 17096718 }, { "epoch": 0.010877719429857465, "grad_norm": 5.058762073516846, "learning_rate": 2.163852407754847e-06, "loss": 1.5165, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 174, "tokens_per_second_per_gpu": 16900.06, "total_tokens": 17194711 }, { "epoch": 0.01094023505876469, "grad_norm": 5.08579158782959, "learning_rate": 2.176360225140713e-06, "loss": 1.5034, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 175, "tokens_per_second_per_gpu": 17065.01, "total_tokens": 17292490 }, { "epoch": 0.011002750687671918, "grad_norm": 4.451949596405029, "learning_rate": 2.1888680425265794e-06, "loss": 1.4535, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 176, "tokens_per_second_per_gpu": 17026.6, "total_tokens": 17386958 }, { "epoch": 0.011065266316579146, "grad_norm": 3.86360764503479, "learning_rate": 2.2013758599124455e-06, "loss": 1.4667, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 177, "tokens_per_second_per_gpu": 16780.55, "total_tokens": 17486842 }, { "epoch": 0.011127781945486371, "grad_norm": 4.5771484375, "learning_rate": 2.2138836772983115e-06, "loss": 1.4643, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 178, "tokens_per_second_per_gpu": 17764.98, "total_tokens": 17585901 }, { "epoch": 0.011190297574393599, "grad_norm": 4.188877105712891, "learning_rate": 2.2263914946841776e-06, "loss": 1.4653, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 179, "tokens_per_second_per_gpu": 17026.17, "total_tokens": 17683507 }, { "epoch": 0.011252813203300824, "grad_norm": 3.6809422969818115, "learning_rate": 2.238899312070044e-06, "loss": 1.4025, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 180, "tokens_per_second_per_gpu": 17486.45, "total_tokens": 17784638 }, { "epoch": 0.011315328832208052, "grad_norm": 3.6980698108673096, "learning_rate": 2.25140712945591e-06, "loss": 1.4553, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 181, "tokens_per_second_per_gpu": 17160.33, "total_tokens": 17887139 }, { "epoch": 0.01137784446111528, "grad_norm": 3.1216323375701904, "learning_rate": 2.263914946841776e-06, "loss": 1.4022, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 182, "tokens_per_second_per_gpu": 17097.26, "total_tokens": 17984769 }, { "epoch": 0.011440360090022505, "grad_norm": 3.248230218887329, "learning_rate": 2.2764227642276426e-06, "loss": 1.41, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 183, "tokens_per_second_per_gpu": 17617.77, "total_tokens": 18083646 }, { "epoch": 0.011502875718929733, "grad_norm": 3.5108790397644043, "learning_rate": 2.2889305816135086e-06, "loss": 1.4413, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 184, "tokens_per_second_per_gpu": 16767.21, "total_tokens": 18178705 }, { "epoch": 0.011565391347836958, "grad_norm": 3.4572463035583496, "learning_rate": 2.301438398999375e-06, "loss": 1.4316, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 185, "tokens_per_second_per_gpu": 17855.16, "total_tokens": 18280330 }, { "epoch": 0.011627906976744186, "grad_norm": 2.8284857273101807, "learning_rate": 2.313946216385241e-06, "loss": 1.4489, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 186, "tokens_per_second_per_gpu": 17432.65, "total_tokens": 18384261 }, { "epoch": 0.011690422605651413, "grad_norm": 2.9524996280670166, "learning_rate": 2.326454033771107e-06, "loss": 1.4336, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 187, "tokens_per_second_per_gpu": 15708.39, "total_tokens": 18479484 }, { "epoch": 0.011752938234558639, "grad_norm": 3.04443359375, "learning_rate": 2.3389618511569736e-06, "loss": 1.4466, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 188, "tokens_per_second_per_gpu": 18174.21, "total_tokens": 18580534 }, { "epoch": 0.011815453863465867, "grad_norm": 2.7444958686828613, "learning_rate": 2.3514696685428396e-06, "loss": 1.3918, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 189, "tokens_per_second_per_gpu": 17709.83, "total_tokens": 18678498 }, { "epoch": 0.011877969492373094, "grad_norm": 3.3306994438171387, "learning_rate": 2.3639774859287057e-06, "loss": 1.4763, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 190, "tokens_per_second_per_gpu": 17231.7, "total_tokens": 18779723 }, { "epoch": 0.01194048512128032, "grad_norm": 2.559199094772339, "learning_rate": 2.3764853033145717e-06, "loss": 1.4108, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 191, "tokens_per_second_per_gpu": 17686.72, "total_tokens": 18881368 }, { "epoch": 0.012003000750187547, "grad_norm": 2.800387382507324, "learning_rate": 2.388993120700438e-06, "loss": 1.4138, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 192, "tokens_per_second_per_gpu": 15728.32, "total_tokens": 18977363 }, { "epoch": 0.012065516379094773, "grad_norm": 2.3012208938598633, "learning_rate": 2.401500938086304e-06, "loss": 1.3316, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 193, "tokens_per_second_per_gpu": 17684.44, "total_tokens": 19075542 }, { "epoch": 0.012128032008002, "grad_norm": 2.4024012088775635, "learning_rate": 2.4140087554721702e-06, "loss": 1.4019, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 194, "tokens_per_second_per_gpu": 18468.06, "total_tokens": 19179011 }, { "epoch": 0.012190547636909228, "grad_norm": 2.3665153980255127, "learning_rate": 2.4265165728580363e-06, "loss": 1.4106, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 195, "tokens_per_second_per_gpu": 18401.75, "total_tokens": 19279500 }, { "epoch": 0.012253063265816454, "grad_norm": 2.759556531906128, "learning_rate": 2.4390243902439027e-06, "loss": 1.3858, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 196, "tokens_per_second_per_gpu": 16888.83, "total_tokens": 19374575 }, { "epoch": 0.012315578894723681, "grad_norm": 2.6344001293182373, "learning_rate": 2.4515322076297688e-06, "loss": 1.4238, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 197, "tokens_per_second_per_gpu": 17833.65, "total_tokens": 19477557 }, { "epoch": 0.012378094523630907, "grad_norm": 2.9381067752838135, "learning_rate": 2.464040025015635e-06, "loss": 1.427, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 198, "tokens_per_second_per_gpu": 15927.67, "total_tokens": 19571893 }, { "epoch": 0.012440610152538134, "grad_norm": 2.6263370513916016, "learning_rate": 2.476547842401501e-06, "loss": 1.3403, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 199, "tokens_per_second_per_gpu": 16775.06, "total_tokens": 19668114 }, { "epoch": 0.012503125781445362, "grad_norm": 2.443824291229248, "learning_rate": 2.4890556597873673e-06, "loss": 1.4095, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 200, "tokens_per_second_per_gpu": 18320.59, "total_tokens": 19769575 }, { "epoch": 0.012565641410352588, "grad_norm": 2.1144909858703613, "learning_rate": 2.5015634771732334e-06, "loss": 1.3133, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 201, "tokens_per_second_per_gpu": 18283.51, "total_tokens": 19869183 }, { "epoch": 0.012628157039259815, "grad_norm": 2.4324536323547363, "learning_rate": 2.5140712945590994e-06, "loss": 1.3704, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 202, "tokens_per_second_per_gpu": 17110.22, "total_tokens": 19966665 }, { "epoch": 0.012690672668167043, "grad_norm": 2.2179200649261475, "learning_rate": 2.5265791119449654e-06, "loss": 1.3804, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 203, "tokens_per_second_per_gpu": 16950.95, "total_tokens": 20064099 }, { "epoch": 0.012753188297074268, "grad_norm": 2.264585256576538, "learning_rate": 2.539086929330832e-06, "loss": 1.3801, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 204, "tokens_per_second_per_gpu": 16854.28, "total_tokens": 20161477 }, { "epoch": 0.012815703925981496, "grad_norm": 2.1338436603546143, "learning_rate": 2.551594746716698e-06, "loss": 1.3504, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 205, "tokens_per_second_per_gpu": 17491.59, "total_tokens": 20263274 }, { "epoch": 0.012878219554888722, "grad_norm": 2.1421353816986084, "learning_rate": 2.564102564102564e-06, "loss": 1.3316, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 206, "tokens_per_second_per_gpu": 18059.29, "total_tokens": 20366390 }, { "epoch": 0.012940735183795949, "grad_norm": 2.307157516479492, "learning_rate": 2.5766103814884304e-06, "loss": 1.3544, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 207, "tokens_per_second_per_gpu": 16603.96, "total_tokens": 20462880 }, { "epoch": 0.013003250812703177, "grad_norm": 2.061502695083618, "learning_rate": 2.5891181988742965e-06, "loss": 1.3659, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 208, "tokens_per_second_per_gpu": 17837.59, "total_tokens": 20564460 }, { "epoch": 0.013065766441610402, "grad_norm": 2.1192238330841064, "learning_rate": 2.601626016260163e-06, "loss": 1.3297, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 209, "tokens_per_second_per_gpu": 17178.8, "total_tokens": 20659480 }, { "epoch": 0.01312828207051763, "grad_norm": 1.8877434730529785, "learning_rate": 2.6141338336460294e-06, "loss": 1.2548, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 210, "tokens_per_second_per_gpu": 14768.55, "total_tokens": 20749852 }, { "epoch": 0.013190797699424855, "grad_norm": 1.9897915124893188, "learning_rate": 2.6266416510318954e-06, "loss": 1.2762, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 211, "tokens_per_second_per_gpu": 15470.15, "total_tokens": 20847494 }, { "epoch": 0.013253313328332083, "grad_norm": 1.8568413257598877, "learning_rate": 2.6391494684177615e-06, "loss": 1.3163, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 212, "tokens_per_second_per_gpu": 16589.66, "total_tokens": 20944870 }, { "epoch": 0.01331582895723931, "grad_norm": 1.86903977394104, "learning_rate": 2.6516572858036275e-06, "loss": 1.3656, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 213, "tokens_per_second_per_gpu": 18523.0, "total_tokens": 21048066 }, { "epoch": 0.013378344586146536, "grad_norm": 1.9296481609344482, "learning_rate": 2.664165103189494e-06, "loss": 1.2767, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 214, "tokens_per_second_per_gpu": 17489.89, "total_tokens": 21147487 }, { "epoch": 0.013440860215053764, "grad_norm": 2.204780101776123, "learning_rate": 2.67667292057536e-06, "loss": 1.266, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 215, "tokens_per_second_per_gpu": 16420.13, "total_tokens": 21245351 }, { "epoch": 0.01350337584396099, "grad_norm": 2.3938467502593994, "learning_rate": 2.689180737961226e-06, "loss": 1.3629, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 216, "tokens_per_second_per_gpu": 16123.89, "total_tokens": 21338647 }, { "epoch": 0.013565891472868217, "grad_norm": 1.7923985719680786, "learning_rate": 2.7016885553470925e-06, "loss": 1.2607, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 217, "tokens_per_second_per_gpu": 18247.51, "total_tokens": 21441532 }, { "epoch": 0.013628407101775444, "grad_norm": 2.9308600425720215, "learning_rate": 2.7141963727329585e-06, "loss": 1.382, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 218, "tokens_per_second_per_gpu": 18405.58, "total_tokens": 21540709 }, { "epoch": 0.01369092273068267, "grad_norm": 1.9302117824554443, "learning_rate": 2.7267041901188246e-06, "loss": 1.3119, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 219, "tokens_per_second_per_gpu": 17000.29, "total_tokens": 21639084 }, { "epoch": 0.013753438359589898, "grad_norm": 1.7099583148956299, "learning_rate": 2.7392120075046906e-06, "loss": 1.2806, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 220, "tokens_per_second_per_gpu": 17511.33, "total_tokens": 21738755 }, { "epoch": 0.013815953988497125, "grad_norm": 1.830113172531128, "learning_rate": 2.751719824890557e-06, "loss": 1.2628, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 221, "tokens_per_second_per_gpu": 17067.32, "total_tokens": 21836226 }, { "epoch": 0.01387846961740435, "grad_norm": 1.8049906492233276, "learning_rate": 2.764227642276423e-06, "loss": 1.3017, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 222, "tokens_per_second_per_gpu": 17720.65, "total_tokens": 21934790 }, { "epoch": 0.013940985246311578, "grad_norm": 1.8552165031433105, "learning_rate": 2.776735459662289e-06, "loss": 1.3353, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 223, "tokens_per_second_per_gpu": 16877.72, "total_tokens": 22032991 }, { "epoch": 0.014003500875218804, "grad_norm": 1.7588484287261963, "learning_rate": 2.789243277048155e-06, "loss": 1.2533, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 224, "tokens_per_second_per_gpu": 17830.04, "total_tokens": 22132794 }, { "epoch": 0.014066016504126031, "grad_norm": 2.1645116806030273, "learning_rate": 2.8017510944340216e-06, "loss": 1.3251, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 225, "tokens_per_second_per_gpu": 15739.62, "total_tokens": 22228861 }, { "epoch": 0.014128532133033259, "grad_norm": 1.9577350616455078, "learning_rate": 2.8142589118198877e-06, "loss": 1.3, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 226, "tokens_per_second_per_gpu": 17784.13, "total_tokens": 22327839 }, { "epoch": 0.014191047761940485, "grad_norm": 1.9526692628860474, "learning_rate": 2.8267667292057537e-06, "loss": 1.2977, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 227, "tokens_per_second_per_gpu": 16925.29, "total_tokens": 22424610 }, { "epoch": 0.014253563390847712, "grad_norm": 1.7703922986984253, "learning_rate": 2.8392745465916198e-06, "loss": 1.2558, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 228, "tokens_per_second_per_gpu": 17734.96, "total_tokens": 22524752 }, { "epoch": 0.014316079019754938, "grad_norm": 1.5953654050827026, "learning_rate": 2.8517823639774862e-06, "loss": 1.2955, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 229, "tokens_per_second_per_gpu": 17513.19, "total_tokens": 22624780 }, { "epoch": 0.014378594648662165, "grad_norm": 1.7202794551849365, "learning_rate": 2.8642901813633523e-06, "loss": 1.2576, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 230, "tokens_per_second_per_gpu": 17078.53, "total_tokens": 22717858 }, { "epoch": 0.014441110277569393, "grad_norm": 1.74393892288208, "learning_rate": 2.8767979987492183e-06, "loss": 1.2384, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 231, "tokens_per_second_per_gpu": 17446.79, "total_tokens": 22817568 }, { "epoch": 0.014503625906476619, "grad_norm": 1.6043472290039062, "learning_rate": 2.8893058161350843e-06, "loss": 1.3009, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 232, "tokens_per_second_per_gpu": 17314.49, "total_tokens": 22916418 }, { "epoch": 0.014566141535383846, "grad_norm": 1.7715835571289062, "learning_rate": 2.901813633520951e-06, "loss": 1.2747, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 233, "tokens_per_second_per_gpu": 16773.01, "total_tokens": 23012221 }, { "epoch": 0.014628657164291074, "grad_norm": 1.6812593936920166, "learning_rate": 2.914321450906817e-06, "loss": 1.2348, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 234, "tokens_per_second_per_gpu": 16106.13, "total_tokens": 23105885 }, { "epoch": 0.0146911727931983, "grad_norm": 1.8911315202713013, "learning_rate": 2.926829268292683e-06, "loss": 1.3215, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 235, "tokens_per_second_per_gpu": 17522.44, "total_tokens": 23202278 }, { "epoch": 0.014753688422105527, "grad_norm": 1.5218578577041626, "learning_rate": 2.9393370856785493e-06, "loss": 1.2275, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 236, "tokens_per_second_per_gpu": 17220.05, "total_tokens": 23303640 }, { "epoch": 0.014816204051012753, "grad_norm": 1.7884365320205688, "learning_rate": 2.9518449030644154e-06, "loss": 1.2439, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 237, "tokens_per_second_per_gpu": 16850.15, "total_tokens": 23399511 }, { "epoch": 0.01487871967991998, "grad_norm": 1.580029010772705, "learning_rate": 2.9643527204502814e-06, "loss": 1.2771, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 238, "tokens_per_second_per_gpu": 17766.4, "total_tokens": 23499244 }, { "epoch": 0.014941235308827207, "grad_norm": 1.453881859779358, "learning_rate": 2.9768605378361474e-06, "loss": 1.2112, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 239, "tokens_per_second_per_gpu": 17401.98, "total_tokens": 23600022 }, { "epoch": 0.015003750937734433, "grad_norm": 1.7694541215896606, "learning_rate": 2.989368355222014e-06, "loss": 1.3282, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 240, "tokens_per_second_per_gpu": 18059.73, "total_tokens": 23702199 }, { "epoch": 0.01506626656664166, "grad_norm": 1.6484688520431519, "learning_rate": 3.0018761726078804e-06, "loss": 1.2493, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 241, "tokens_per_second_per_gpu": 17008.79, "total_tokens": 23800346 }, { "epoch": 0.015128782195548886, "grad_norm": 1.5169364213943481, "learning_rate": 3.0143839899937464e-06, "loss": 1.2612, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 242, "tokens_per_second_per_gpu": 16574.13, "total_tokens": 23899648 }, { "epoch": 0.015191297824456114, "grad_norm": 1.655936360359192, "learning_rate": 3.026891807379613e-06, "loss": 1.2951, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 243, "tokens_per_second_per_gpu": 18922.89, "total_tokens": 24003833 }, { "epoch": 0.015253813453363341, "grad_norm": 1.8081159591674805, "learning_rate": 3.039399624765479e-06, "loss": 1.3082, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 244, "tokens_per_second_per_gpu": 17640.28, "total_tokens": 24104280 }, { "epoch": 0.015316329082270567, "grad_norm": 1.5002126693725586, "learning_rate": 3.051907442151345e-06, "loss": 1.2064, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 245, "tokens_per_second_per_gpu": 17674.65, "total_tokens": 24201691 }, { "epoch": 0.015378844711177795, "grad_norm": 1.6151337623596191, "learning_rate": 3.0644152595372114e-06, "loss": 1.2828, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 246, "tokens_per_second_per_gpu": 18487.9, "total_tokens": 24305300 }, { "epoch": 0.015441360340085022, "grad_norm": 1.7425503730773926, "learning_rate": 3.0769230769230774e-06, "loss": 1.2686, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 247, "tokens_per_second_per_gpu": 17807.14, "total_tokens": 24404588 }, { "epoch": 0.015503875968992248, "grad_norm": 1.440250277519226, "learning_rate": 3.0894308943089435e-06, "loss": 1.3025, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 248, "tokens_per_second_per_gpu": 16704.7, "total_tokens": 24503598 }, { "epoch": 0.015566391597899475, "grad_norm": 1.63728666305542, "learning_rate": 3.1019387116948095e-06, "loss": 1.2888, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 249, "tokens_per_second_per_gpu": 17903.27, "total_tokens": 24605810 }, { "epoch": 0.015628907226806703, "grad_norm": 1.462266206741333, "learning_rate": 3.114446529080676e-06, "loss": 1.1954, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 250, "tokens_per_second_per_gpu": 17264.86, "total_tokens": 24703936 }, { "epoch": 0.015691422855713927, "grad_norm": 1.5349632501602173, "learning_rate": 3.126954346466542e-06, "loss": 1.2842, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 251, "tokens_per_second_per_gpu": 17304.07, "total_tokens": 24803474 }, { "epoch": 0.015753938484621154, "grad_norm": 1.4008419513702393, "learning_rate": 3.139462163852408e-06, "loss": 1.1798, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 252, "tokens_per_second_per_gpu": 18410.12, "total_tokens": 24903922 }, { "epoch": 0.015816454113528382, "grad_norm": 1.7323768138885498, "learning_rate": 3.151969981238274e-06, "loss": 1.331, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 253, "tokens_per_second_per_gpu": 18231.72, "total_tokens": 25003528 }, { "epoch": 0.01587896974243561, "grad_norm": 1.5328274965286255, "learning_rate": 3.1644777986241405e-06, "loss": 1.2945, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 254, "tokens_per_second_per_gpu": 17040.29, "total_tokens": 25097978 }, { "epoch": 0.015941485371342837, "grad_norm": 1.6168408393859863, "learning_rate": 3.1769856160100066e-06, "loss": 1.2818, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 255, "tokens_per_second_per_gpu": 17468.8, "total_tokens": 25194837 }, { "epoch": 0.016004001000250064, "grad_norm": 1.421487808227539, "learning_rate": 3.1894934333958726e-06, "loss": 1.246, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 256, "tokens_per_second_per_gpu": 18090.86, "total_tokens": 25300656 }, { "epoch": 0.016066516629157288, "grad_norm": 1.4051388502120972, "learning_rate": 3.2020012507817387e-06, "loss": 1.226, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 257, "tokens_per_second_per_gpu": 17944.08, "total_tokens": 25401018 }, { "epoch": 0.016129032258064516, "grad_norm": 1.5492585897445679, "learning_rate": 3.214509068167605e-06, "loss": 1.2275, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 258, "tokens_per_second_per_gpu": 17076.83, "total_tokens": 25497383 }, { "epoch": 0.016191547886971743, "grad_norm": 1.7859035730361938, "learning_rate": 3.227016885553471e-06, "loss": 1.2604, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 259, "tokens_per_second_per_gpu": 18274.17, "total_tokens": 25597659 }, { "epoch": 0.01625406351587897, "grad_norm": 1.648841142654419, "learning_rate": 3.239524702939337e-06, "loss": 1.2609, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 260, "tokens_per_second_per_gpu": 18410.39, "total_tokens": 25697076 }, { "epoch": 0.016316579144786198, "grad_norm": 1.5160726308822632, "learning_rate": 3.2520325203252037e-06, "loss": 1.243, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 261, "tokens_per_second_per_gpu": 16730.45, "total_tokens": 25794655 }, { "epoch": 0.016379094773693422, "grad_norm": 1.3224061727523804, "learning_rate": 3.2645403377110697e-06, "loss": 1.2055, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 262, "tokens_per_second_per_gpu": 18879.65, "total_tokens": 25897264 }, { "epoch": 0.01644161040260065, "grad_norm": 1.3919501304626465, "learning_rate": 3.2770481550969357e-06, "loss": 1.2224, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 263, "tokens_per_second_per_gpu": 18496.09, "total_tokens": 26000205 }, { "epoch": 0.016504126031507877, "grad_norm": 1.6605066061019897, "learning_rate": 3.2895559724828018e-06, "loss": 1.2282, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 264, "tokens_per_second_per_gpu": 16641.48, "total_tokens": 26097622 }, { "epoch": 0.016566641660415105, "grad_norm": 1.4076168537139893, "learning_rate": 3.3020637898686682e-06, "loss": 1.1914, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 265, "tokens_per_second_per_gpu": 16530.36, "total_tokens": 26198591 }, { "epoch": 0.016629157289322332, "grad_norm": 1.4301886558532715, "learning_rate": 3.3145716072545343e-06, "loss": 1.2414, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 266, "tokens_per_second_per_gpu": 16985.61, "total_tokens": 26296227 }, { "epoch": 0.016691672918229556, "grad_norm": 1.2597196102142334, "learning_rate": 3.3270794246404003e-06, "loss": 1.2052, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 267, "tokens_per_second_per_gpu": 18912.09, "total_tokens": 26399639 }, { "epoch": 0.016754188547136784, "grad_norm": 1.5571476221084595, "learning_rate": 3.3395872420262663e-06, "loss": 1.2364, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 268, "tokens_per_second_per_gpu": 18771.77, "total_tokens": 26503846 }, { "epoch": 0.01681670417604401, "grad_norm": 1.321806788444519, "learning_rate": 3.352095059412133e-06, "loss": 1.205, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 269, "tokens_per_second_per_gpu": 17657.8, "total_tokens": 26603910 }, { "epoch": 0.01687921980495124, "grad_norm": 1.4279999732971191, "learning_rate": 3.364602876797999e-06, "loss": 1.1918, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 270, "tokens_per_second_per_gpu": 17001.78, "total_tokens": 26701242 }, { "epoch": 0.016941735433858466, "grad_norm": 1.4301433563232422, "learning_rate": 3.377110694183865e-06, "loss": 1.2764, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 271, "tokens_per_second_per_gpu": 15488.63, "total_tokens": 26796029 }, { "epoch": 0.01700425106276569, "grad_norm": 1.6233949661254883, "learning_rate": 3.389618511569731e-06, "loss": 1.2398, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 272, "tokens_per_second_per_gpu": 18299.12, "total_tokens": 26899664 }, { "epoch": 0.017066766691672917, "grad_norm": 1.3994719982147217, "learning_rate": 3.402126328955598e-06, "loss": 1.1935, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 273, "tokens_per_second_per_gpu": 17518.28, "total_tokens": 26998542 }, { "epoch": 0.017129282320580145, "grad_norm": 1.4069571495056152, "learning_rate": 3.414634146341464e-06, "loss": 1.2063, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 274, "tokens_per_second_per_gpu": 16415.94, "total_tokens": 27094556 }, { "epoch": 0.017191797949487372, "grad_norm": 1.5069564580917358, "learning_rate": 3.4271419637273303e-06, "loss": 1.2761, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 275, "tokens_per_second_per_gpu": 16954.97, "total_tokens": 27189360 }, { "epoch": 0.0172543135783946, "grad_norm": 1.4229686260223389, "learning_rate": 3.4396497811131963e-06, "loss": 1.218, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 276, "tokens_per_second_per_gpu": 17539.74, "total_tokens": 27284400 }, { "epoch": 0.017316829207301824, "grad_norm": 1.4264416694641113, "learning_rate": 3.4521575984990624e-06, "loss": 1.2088, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 277, "tokens_per_second_per_gpu": 17784.4, "total_tokens": 27387549 }, { "epoch": 0.01737934483620905, "grad_norm": 1.3142004013061523, "learning_rate": 3.4646654158849284e-06, "loss": 1.1962, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 278, "tokens_per_second_per_gpu": 15360.52, "total_tokens": 27481814 }, { "epoch": 0.01744186046511628, "grad_norm": 1.4756501913070679, "learning_rate": 3.477173233270795e-06, "loss": 1.1938, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 279, "tokens_per_second_per_gpu": 16408.58, "total_tokens": 27577105 }, { "epoch": 0.017504376094023506, "grad_norm": 1.4047743082046509, "learning_rate": 3.489681050656661e-06, "loss": 1.242, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 280, "tokens_per_second_per_gpu": 17781.79, "total_tokens": 27676830 }, { "epoch": 0.017566891722930734, "grad_norm": 1.429990291595459, "learning_rate": 3.502188868042527e-06, "loss": 1.1865, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 281, "tokens_per_second_per_gpu": 18000.01, "total_tokens": 27773411 }, { "epoch": 0.017629407351837958, "grad_norm": 1.330841302871704, "learning_rate": 3.514696685428393e-06, "loss": 1.1675, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 282, "tokens_per_second_per_gpu": 17635.47, "total_tokens": 27871766 }, { "epoch": 0.017691922980745185, "grad_norm": 1.4660507440567017, "learning_rate": 3.5272045028142594e-06, "loss": 1.2351, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 283, "tokens_per_second_per_gpu": 17843.67, "total_tokens": 27968866 }, { "epoch": 0.017754438609652413, "grad_norm": 1.2653971910476685, "learning_rate": 3.5397123202001255e-06, "loss": 1.1836, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 284, "tokens_per_second_per_gpu": 17350.04, "total_tokens": 28067990 }, { "epoch": 0.01781695423855964, "grad_norm": 1.298018455505371, "learning_rate": 3.5522201375859915e-06, "loss": 1.2032, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 285, "tokens_per_second_per_gpu": 16368.33, "total_tokens": 28164275 }, { "epoch": 0.017879469867466868, "grad_norm": 1.3539541959762573, "learning_rate": 3.5647279549718576e-06, "loss": 1.1857, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 286, "tokens_per_second_per_gpu": 17105.02, "total_tokens": 28262115 }, { "epoch": 0.017941985496374095, "grad_norm": 1.412809133529663, "learning_rate": 3.577235772357724e-06, "loss": 1.2164, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 287, "tokens_per_second_per_gpu": 17803.98, "total_tokens": 28364018 }, { "epoch": 0.01800450112528132, "grad_norm": 1.4194624423980713, "learning_rate": 3.58974358974359e-06, "loss": 1.1613, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 288, "tokens_per_second_per_gpu": 17178.0, "total_tokens": 28461892 }, { "epoch": 0.018067016754188547, "grad_norm": 1.24784517288208, "learning_rate": 3.602251407129456e-06, "loss": 1.2033, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 289, "tokens_per_second_per_gpu": 17656.63, "total_tokens": 28563636 }, { "epoch": 0.018129532383095774, "grad_norm": 1.2883002758026123, "learning_rate": 3.6147592245153226e-06, "loss": 1.2233, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 290, "tokens_per_second_per_gpu": 16738.63, "total_tokens": 28665038 }, { "epoch": 0.018192048012003, "grad_norm": 1.2985868453979492, "learning_rate": 3.6272670419011886e-06, "loss": 1.217, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 291, "tokens_per_second_per_gpu": 18274.04, "total_tokens": 28766674 }, { "epoch": 0.01825456364091023, "grad_norm": 1.312909483909607, "learning_rate": 3.6397748592870546e-06, "loss": 1.1118, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 292, "tokens_per_second_per_gpu": 16119.86, "total_tokens": 28861274 }, { "epoch": 0.018317079269817453, "grad_norm": 1.355197787284851, "learning_rate": 3.6522826766729207e-06, "loss": 1.168, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 293, "tokens_per_second_per_gpu": 15854.0, "total_tokens": 28958729 }, { "epoch": 0.01837959489872468, "grad_norm": 1.3991421461105347, "learning_rate": 3.664790494058787e-06, "loss": 1.1996, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 294, "tokens_per_second_per_gpu": 16890.46, "total_tokens": 29054382 }, { "epoch": 0.018442110527631908, "grad_norm": 1.3619589805603027, "learning_rate": 3.677298311444653e-06, "loss": 1.2132, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 295, "tokens_per_second_per_gpu": 17757.87, "total_tokens": 29154092 }, { "epoch": 0.018504626156539136, "grad_norm": 1.635970950126648, "learning_rate": 3.689806128830519e-06, "loss": 1.1809, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 296, "tokens_per_second_per_gpu": 16961.28, "total_tokens": 29254022 }, { "epoch": 0.018567141785446363, "grad_norm": 1.1854101419448853, "learning_rate": 3.7023139462163852e-06, "loss": 1.1743, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 297, "tokens_per_second_per_gpu": 17267.57, "total_tokens": 29354158 }, { "epoch": 0.018629657414353587, "grad_norm": 1.2488892078399658, "learning_rate": 3.7148217636022517e-06, "loss": 1.1617, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 298, "tokens_per_second_per_gpu": 18437.4, "total_tokens": 29450232 }, { "epoch": 0.018692173043260814, "grad_norm": 1.2960158586502075, "learning_rate": 3.7273295809881177e-06, "loss": 1.2021, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 299, "tokens_per_second_per_gpu": 18566.54, "total_tokens": 29550998 }, { "epoch": 0.018754688672168042, "grad_norm": 1.3213742971420288, "learning_rate": 3.7398373983739838e-06, "loss": 1.2025, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 300, "tokens_per_second_per_gpu": 16737.35, "total_tokens": 29650875 }, { "epoch": 0.01881720430107527, "grad_norm": 1.2044235467910767, "learning_rate": 3.75234521575985e-06, "loss": 1.1748, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 301, "tokens_per_second_per_gpu": 17389.36, "total_tokens": 29749226 }, { "epoch": 0.018879719929982497, "grad_norm": 1.2700371742248535, "learning_rate": 3.7648530331457163e-06, "loss": 1.193, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 302, "tokens_per_second_per_gpu": 17642.78, "total_tokens": 29847690 }, { "epoch": 0.01894223555888972, "grad_norm": 1.2924331426620483, "learning_rate": 3.7773608505315823e-06, "loss": 1.1836, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 303, "tokens_per_second_per_gpu": 17120.32, "total_tokens": 29943981 }, { "epoch": 0.01900475118779695, "grad_norm": 1.1950156688690186, "learning_rate": 3.7898686679174484e-06, "loss": 1.1253, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 304, "tokens_per_second_per_gpu": 17450.88, "total_tokens": 30044151 }, { "epoch": 0.019067266816704176, "grad_norm": 1.4590364694595337, "learning_rate": 3.8023764853033152e-06, "loss": 1.1916, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 305, "tokens_per_second_per_gpu": 16424.23, "total_tokens": 30139458 }, { "epoch": 0.019129782445611403, "grad_norm": 1.2637333869934082, "learning_rate": 3.814884302689181e-06, "loss": 1.1198, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 306, "tokens_per_second_per_gpu": 16493.66, "total_tokens": 30237470 }, { "epoch": 0.01919229807451863, "grad_norm": 1.2996165752410889, "learning_rate": 3.827392120075047e-06, "loss": 1.1687, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 307, "tokens_per_second_per_gpu": 17584.56, "total_tokens": 30334865 }, { "epoch": 0.019254813703425855, "grad_norm": 1.2152594327926636, "learning_rate": 3.839899937460913e-06, "loss": 1.1754, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 308, "tokens_per_second_per_gpu": 17651.69, "total_tokens": 30434134 }, { "epoch": 0.019317329332333082, "grad_norm": 1.4194374084472656, "learning_rate": 3.852407754846779e-06, "loss": 1.1852, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 309, "tokens_per_second_per_gpu": 17519.16, "total_tokens": 30533051 }, { "epoch": 0.01937984496124031, "grad_norm": 1.2719546556472778, "learning_rate": 3.864915572232646e-06, "loss": 1.2179, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 310, "tokens_per_second_per_gpu": 18306.01, "total_tokens": 30634948 }, { "epoch": 0.019442360590147537, "grad_norm": 1.2259068489074707, "learning_rate": 3.877423389618512e-06, "loss": 1.2027, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 311, "tokens_per_second_per_gpu": 18809.55, "total_tokens": 30740826 }, { "epoch": 0.019504876219054765, "grad_norm": 1.2646312713623047, "learning_rate": 3.889931207004378e-06, "loss": 1.1188, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 312, "tokens_per_second_per_gpu": 16786.15, "total_tokens": 30841131 }, { "epoch": 0.01956739184796199, "grad_norm": 1.255942702293396, "learning_rate": 3.902439024390244e-06, "loss": 1.183, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 313, "tokens_per_second_per_gpu": 17168.03, "total_tokens": 30941064 }, { "epoch": 0.019629907476869216, "grad_norm": 1.5397518873214722, "learning_rate": 3.9149468417761104e-06, "loss": 1.1445, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 314, "tokens_per_second_per_gpu": 17587.26, "total_tokens": 31039029 }, { "epoch": 0.019692423105776444, "grad_norm": 1.2150801420211792, "learning_rate": 3.9274546591619765e-06, "loss": 1.18, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 315, "tokens_per_second_per_gpu": 17545.59, "total_tokens": 31138150 }, { "epoch": 0.01975493873468367, "grad_norm": 1.232177734375, "learning_rate": 3.9399624765478425e-06, "loss": 1.1494, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 316, "tokens_per_second_per_gpu": 16988.86, "total_tokens": 31239182 }, { "epoch": 0.0198174543635909, "grad_norm": 1.2029645442962646, "learning_rate": 3.9524702939337085e-06, "loss": 1.2096, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 317, "tokens_per_second_per_gpu": 17962.23, "total_tokens": 31342424 }, { "epoch": 0.019879969992498126, "grad_norm": 1.2511718273162842, "learning_rate": 3.964978111319575e-06, "loss": 1.1382, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 318, "tokens_per_second_per_gpu": 17648.3, "total_tokens": 31441434 }, { "epoch": 0.01994248562140535, "grad_norm": 1.188072919845581, "learning_rate": 3.9774859287054415e-06, "loss": 1.1317, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 319, "tokens_per_second_per_gpu": 16870.16, "total_tokens": 31540188 }, { "epoch": 0.020005001250312578, "grad_norm": 1.2442004680633545, "learning_rate": 3.9899937460913075e-06, "loss": 1.202, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 320, "tokens_per_second_per_gpu": 17751.28, "total_tokens": 31642143 }, { "epoch": 0.020067516879219805, "grad_norm": 1.3424253463745117, "learning_rate": 4.0025015634771735e-06, "loss": 1.1837, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 321, "tokens_per_second_per_gpu": 17939.48, "total_tokens": 31742900 }, { "epoch": 0.020130032508127033, "grad_norm": 1.200818419456482, "learning_rate": 4.0150093808630396e-06, "loss": 1.1645, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 322, "tokens_per_second_per_gpu": 16525.13, "total_tokens": 31839230 }, { "epoch": 0.02019254813703426, "grad_norm": 1.2576993703842163, "learning_rate": 4.027517198248906e-06, "loss": 1.1453, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 323, "tokens_per_second_per_gpu": 18150.79, "total_tokens": 31940353 }, { "epoch": 0.020255063765941484, "grad_norm": 1.1831344366073608, "learning_rate": 4.040025015634772e-06, "loss": 1.1489, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 324, "tokens_per_second_per_gpu": 17305.1, "total_tokens": 32042465 }, { "epoch": 0.02031757939484871, "grad_norm": 1.2246500253677368, "learning_rate": 4.0525328330206385e-06, "loss": 1.1524, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 325, "tokens_per_second_per_gpu": 18336.07, "total_tokens": 32144722 }, { "epoch": 0.02038009502375594, "grad_norm": 1.1908090114593506, "learning_rate": 4.0650406504065046e-06, "loss": 1.135, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 326, "tokens_per_second_per_gpu": 17192.71, "total_tokens": 32242081 }, { "epoch": 0.020442610652663167, "grad_norm": 1.3204774856567383, "learning_rate": 4.077548467792371e-06, "loss": 1.1842, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 327, "tokens_per_second_per_gpu": 16698.81, "total_tokens": 32340157 }, { "epoch": 0.020505126281570394, "grad_norm": 1.1780369281768799, "learning_rate": 4.090056285178237e-06, "loss": 1.1624, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 328, "tokens_per_second_per_gpu": 18047.88, "total_tokens": 32439875 }, { "epoch": 0.020567641910477618, "grad_norm": 1.283573865890503, "learning_rate": 4.102564102564103e-06, "loss": 1.1742, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 329, "tokens_per_second_per_gpu": 17475.6, "total_tokens": 32538954 }, { "epoch": 0.020630157539384845, "grad_norm": 1.4745992422103882, "learning_rate": 4.115071919949969e-06, "loss": 1.1532, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 330, "tokens_per_second_per_gpu": 16441.53, "total_tokens": 32637751 }, { "epoch": 0.020692673168292073, "grad_norm": 1.213923454284668, "learning_rate": 4.127579737335835e-06, "loss": 1.1551, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 331, "tokens_per_second_per_gpu": 16761.29, "total_tokens": 32735511 }, { "epoch": 0.0207551887971993, "grad_norm": 1.3494797945022583, "learning_rate": 4.140087554721701e-06, "loss": 1.1933, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 332, "tokens_per_second_per_gpu": 18516.53, "total_tokens": 32833580 }, { "epoch": 0.020817704426106528, "grad_norm": 1.2084940671920776, "learning_rate": 4.152595372107568e-06, "loss": 1.1102, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 333, "tokens_per_second_per_gpu": 16902.25, "total_tokens": 32931194 }, { "epoch": 0.020880220055013752, "grad_norm": 1.2333128452301025, "learning_rate": 4.165103189493434e-06, "loss": 1.1717, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 334, "tokens_per_second_per_gpu": 16351.09, "total_tokens": 33029392 }, { "epoch": 0.02094273568392098, "grad_norm": 1.3755663633346558, "learning_rate": 4.1776110068793e-06, "loss": 1.2052, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 335, "tokens_per_second_per_gpu": 17540.19, "total_tokens": 33129211 }, { "epoch": 0.021005251312828207, "grad_norm": 1.1916799545288086, "learning_rate": 4.190118824265166e-06, "loss": 1.1694, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 336, "tokens_per_second_per_gpu": 18345.94, "total_tokens": 33232386 }, { "epoch": 0.021067766941735434, "grad_norm": 1.2022603750228882, "learning_rate": 4.202626641651033e-06, "loss": 1.2204, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 337, "tokens_per_second_per_gpu": 19167.96, "total_tokens": 33340718 }, { "epoch": 0.021130282570642662, "grad_norm": 1.2725534439086914, "learning_rate": 4.215134459036899e-06, "loss": 1.1862, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 338, "tokens_per_second_per_gpu": 18145.93, "total_tokens": 33442096 }, { "epoch": 0.021192798199549886, "grad_norm": 1.245975375175476, "learning_rate": 4.227642276422765e-06, "loss": 1.102, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 339, "tokens_per_second_per_gpu": 16880.47, "total_tokens": 33540478 }, { "epoch": 0.021255313828457113, "grad_norm": 1.3100895881652832, "learning_rate": 4.240150093808631e-06, "loss": 1.1598, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 340, "tokens_per_second_per_gpu": 17146.9, "total_tokens": 33640077 }, { "epoch": 0.02131782945736434, "grad_norm": 1.215361475944519, "learning_rate": 4.252657911194497e-06, "loss": 1.1655, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 341, "tokens_per_second_per_gpu": 17206.86, "total_tokens": 33734965 }, { "epoch": 0.02138034508627157, "grad_norm": 1.1635487079620361, "learning_rate": 4.265165728580363e-06, "loss": 1.0723, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 342, "tokens_per_second_per_gpu": 16598.2, "total_tokens": 33834220 }, { "epoch": 0.021442860715178796, "grad_norm": 1.3058218955993652, "learning_rate": 4.27767354596623e-06, "loss": 1.126, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 343, "tokens_per_second_per_gpu": 16738.22, "total_tokens": 33929784 }, { "epoch": 0.021505376344086023, "grad_norm": 1.199449062347412, "learning_rate": 4.290181363352096e-06, "loss": 1.1536, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 344, "tokens_per_second_per_gpu": 16342.32, "total_tokens": 34027185 }, { "epoch": 0.021567891972993247, "grad_norm": 1.3158997297286987, "learning_rate": 4.302689180737962e-06, "loss": 1.2198, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 345, "tokens_per_second_per_gpu": 17140.89, "total_tokens": 34125168 }, { "epoch": 0.021630407601900475, "grad_norm": 1.2614026069641113, "learning_rate": 4.315196998123828e-06, "loss": 1.1533, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 346, "tokens_per_second_per_gpu": 17245.01, "total_tokens": 34221207 }, { "epoch": 0.021692923230807702, "grad_norm": 1.2074127197265625, "learning_rate": 4.327704815509694e-06, "loss": 1.1027, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 347, "tokens_per_second_per_gpu": 16092.87, "total_tokens": 34318791 }, { "epoch": 0.02175543885971493, "grad_norm": 1.191852331161499, "learning_rate": 4.34021263289556e-06, "loss": 1.0886, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 348, "tokens_per_second_per_gpu": 16803.38, "total_tokens": 34416504 }, { "epoch": 0.021817954488622157, "grad_norm": 1.22170889377594, "learning_rate": 4.352720450281426e-06, "loss": 1.1648, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 349, "tokens_per_second_per_gpu": 17585.58, "total_tokens": 34517345 }, { "epoch": 0.02188047011752938, "grad_norm": 1.3644459247589111, "learning_rate": 4.365228267667293e-06, "loss": 1.1746, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 350, "tokens_per_second_per_gpu": 17633.81, "total_tokens": 34615560 }, { "epoch": 0.02194298574643661, "grad_norm": 1.126873254776001, "learning_rate": 4.377736085053159e-06, "loss": 1.1675, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 351, "tokens_per_second_per_gpu": 18922.4, "total_tokens": 34719504 }, { "epoch": 0.022005501375343836, "grad_norm": 1.1384711265563965, "learning_rate": 4.390243902439025e-06, "loss": 1.1288, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 352, "tokens_per_second_per_gpu": 17816.43, "total_tokens": 34822365 }, { "epoch": 0.022068017004251064, "grad_norm": 1.252061367034912, "learning_rate": 4.402751719824891e-06, "loss": 1.1668, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 353, "tokens_per_second_per_gpu": 18362.99, "total_tokens": 34921729 }, { "epoch": 0.02213053263315829, "grad_norm": 1.2371101379394531, "learning_rate": 4.415259537210757e-06, "loss": 1.1184, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 354, "tokens_per_second_per_gpu": 16529.63, "total_tokens": 35019018 }, { "epoch": 0.022193048262065515, "grad_norm": 1.286365270614624, "learning_rate": 4.427767354596623e-06, "loss": 1.1658, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 355, "tokens_per_second_per_gpu": 17061.88, "total_tokens": 35113363 }, { "epoch": 0.022255563890972743, "grad_norm": 1.1560295820236206, "learning_rate": 4.440275171982489e-06, "loss": 1.1138, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 356, "tokens_per_second_per_gpu": 17904.73, "total_tokens": 35213608 }, { "epoch": 0.02231807951987997, "grad_norm": 1.161531925201416, "learning_rate": 4.452782989368355e-06, "loss": 1.1299, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 357, "tokens_per_second_per_gpu": 17991.39, "total_tokens": 35317478 }, { "epoch": 0.022380595148787197, "grad_norm": 1.2307202816009521, "learning_rate": 4.465290806754222e-06, "loss": 1.1322, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 358, "tokens_per_second_per_gpu": 17647.99, "total_tokens": 35417915 }, { "epoch": 0.022443110777694425, "grad_norm": 1.254777193069458, "learning_rate": 4.477798624140088e-06, "loss": 1.165, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 359, "tokens_per_second_per_gpu": 16996.7, "total_tokens": 35514905 }, { "epoch": 0.02250562640660165, "grad_norm": 1.2480086088180542, "learning_rate": 4.490306441525954e-06, "loss": 1.1797, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 360, "tokens_per_second_per_gpu": 16850.68, "total_tokens": 35612905 }, { "epoch": 0.022568142035508876, "grad_norm": 1.225554347038269, "learning_rate": 4.50281425891182e-06, "loss": 1.1594, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 361, "tokens_per_second_per_gpu": 18290.55, "total_tokens": 35714017 }, { "epoch": 0.022630657664416104, "grad_norm": 1.3240810632705688, "learning_rate": 4.515322076297686e-06, "loss": 1.1735, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 362, "tokens_per_second_per_gpu": 18237.3, "total_tokens": 35815848 }, { "epoch": 0.02269317329332333, "grad_norm": 1.1434775590896606, "learning_rate": 4.527829893683552e-06, "loss": 1.1261, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 363, "tokens_per_second_per_gpu": 18058.41, "total_tokens": 35920338 }, { "epoch": 0.02275568892223056, "grad_norm": 1.160567283630371, "learning_rate": 4.540337711069418e-06, "loss": 1.1771, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 364, "tokens_per_second_per_gpu": 17639.08, "total_tokens": 36022224 }, { "epoch": 0.022818204551137783, "grad_norm": 1.289616584777832, "learning_rate": 4.552845528455285e-06, "loss": 1.1616, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 365, "tokens_per_second_per_gpu": 18160.27, "total_tokens": 36123536 }, { "epoch": 0.02288072018004501, "grad_norm": 1.2361809015274048, "learning_rate": 4.565353345841151e-06, "loss": 1.1145, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 366, "tokens_per_second_per_gpu": 16299.24, "total_tokens": 36216164 }, { "epoch": 0.022943235808952238, "grad_norm": 1.2663764953613281, "learning_rate": 4.577861163227017e-06, "loss": 1.1244, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 367, "tokens_per_second_per_gpu": 15788.37, "total_tokens": 36309035 }, { "epoch": 0.023005751437859465, "grad_norm": 1.183398962020874, "learning_rate": 4.590368980612883e-06, "loss": 1.142, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 368, "tokens_per_second_per_gpu": 17815.71, "total_tokens": 36412678 }, { "epoch": 0.023068267066766693, "grad_norm": 1.1522977352142334, "learning_rate": 4.60287679799875e-06, "loss": 1.1773, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 369, "tokens_per_second_per_gpu": 17437.15, "total_tokens": 36512833 }, { "epoch": 0.023130782695673917, "grad_norm": 1.2953269481658936, "learning_rate": 4.615384615384616e-06, "loss": 1.163, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 370, "tokens_per_second_per_gpu": 17545.24, "total_tokens": 36612008 }, { "epoch": 0.023193298324581144, "grad_norm": 1.197845458984375, "learning_rate": 4.627892432770482e-06, "loss": 1.1441, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 371, "tokens_per_second_per_gpu": 17553.64, "total_tokens": 36712493 }, { "epoch": 0.023255813953488372, "grad_norm": 1.172309160232544, "learning_rate": 4.640400250156348e-06, "loss": 1.1345, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 372, "tokens_per_second_per_gpu": 18362.97, "total_tokens": 36812834 }, { "epoch": 0.0233183295823956, "grad_norm": 1.2604409456253052, "learning_rate": 4.652908067542214e-06, "loss": 1.085, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 373, "tokens_per_second_per_gpu": 16487.98, "total_tokens": 36911366 }, { "epoch": 0.023380845211302827, "grad_norm": 1.164414644241333, "learning_rate": 4.66541588492808e-06, "loss": 1.1296, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 374, "tokens_per_second_per_gpu": 18754.28, "total_tokens": 37012806 }, { "epoch": 0.023443360840210054, "grad_norm": 1.1535838842391968, "learning_rate": 4.677923702313947e-06, "loss": 1.067, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 375, "tokens_per_second_per_gpu": 17386.09, "total_tokens": 37115368 }, { "epoch": 0.023505876469117278, "grad_norm": 1.16624915599823, "learning_rate": 4.690431519699813e-06, "loss": 1.1234, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 376, "tokens_per_second_per_gpu": 16475.24, "total_tokens": 37213054 }, { "epoch": 0.023568392098024506, "grad_norm": 1.224691390991211, "learning_rate": 4.702939337085679e-06, "loss": 1.1299, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 377, "tokens_per_second_per_gpu": 17634.84, "total_tokens": 37312043 }, { "epoch": 0.023630907726931733, "grad_norm": 1.438910722732544, "learning_rate": 4.715447154471545e-06, "loss": 1.1441, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 378, "tokens_per_second_per_gpu": 16804.84, "total_tokens": 37411431 }, { "epoch": 0.02369342335583896, "grad_norm": 1.1220529079437256, "learning_rate": 4.727954971857411e-06, "loss": 1.127, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 379, "tokens_per_second_per_gpu": 17998.33, "total_tokens": 37513552 }, { "epoch": 0.023755938984746188, "grad_norm": 1.220204472541809, "learning_rate": 4.740462789243277e-06, "loss": 1.1123, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 380, "tokens_per_second_per_gpu": 16701.79, "total_tokens": 37609386 }, { "epoch": 0.023818454613653412, "grad_norm": 1.2908369302749634, "learning_rate": 4.752970606629143e-06, "loss": 1.1494, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 381, "tokens_per_second_per_gpu": 17282.79, "total_tokens": 37709181 }, { "epoch": 0.02388097024256064, "grad_norm": 1.3007819652557373, "learning_rate": 4.7654784240150095e-06, "loss": 1.1752, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 382, "tokens_per_second_per_gpu": 17938.62, "total_tokens": 37807802 }, { "epoch": 0.023943485871467867, "grad_norm": 1.1446199417114258, "learning_rate": 4.777986241400876e-06, "loss": 1.1545, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 383, "tokens_per_second_per_gpu": 17144.04, "total_tokens": 37910676 }, { "epoch": 0.024006001500375095, "grad_norm": 1.1818609237670898, "learning_rate": 4.790494058786742e-06, "loss": 1.0831, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 384, "tokens_per_second_per_gpu": 17192.33, "total_tokens": 38010044 }, { "epoch": 0.024068517129282322, "grad_norm": 2.971224546432495, "learning_rate": 4.803001876172608e-06, "loss": 1.1307, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 385, "tokens_per_second_per_gpu": 17322.37, "total_tokens": 38110553 }, { "epoch": 0.024131032758189546, "grad_norm": 1.1304250955581665, "learning_rate": 4.8155096935584744e-06, "loss": 1.0942, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 386, "tokens_per_second_per_gpu": 17482.86, "total_tokens": 38209008 }, { "epoch": 0.024193548387096774, "grad_norm": 1.1653093099594116, "learning_rate": 4.8280175109443405e-06, "loss": 1.0822, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 387, "tokens_per_second_per_gpu": 17019.37, "total_tokens": 38306229 }, { "epoch": 0.024256064016004, "grad_norm": 1.1922316551208496, "learning_rate": 4.8405253283302065e-06, "loss": 1.1533, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 388, "tokens_per_second_per_gpu": 16778.38, "total_tokens": 38403913 }, { "epoch": 0.02431857964491123, "grad_norm": 1.157018780708313, "learning_rate": 4.8530331457160726e-06, "loss": 1.1405, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 389, "tokens_per_second_per_gpu": 18132.1, "total_tokens": 38505710 }, { "epoch": 0.024381095273818456, "grad_norm": 1.535476565361023, "learning_rate": 4.865540963101939e-06, "loss": 1.0819, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 390, "tokens_per_second_per_gpu": 16515.16, "total_tokens": 38603441 }, { "epoch": 0.02444361090272568, "grad_norm": 1.1490912437438965, "learning_rate": 4.8780487804878055e-06, "loss": 1.1423, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 391, "tokens_per_second_per_gpu": 18154.64, "total_tokens": 38703801 }, { "epoch": 0.024506126531632907, "grad_norm": 1.1435693502426147, "learning_rate": 4.8905565978736715e-06, "loss": 1.1507, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 392, "tokens_per_second_per_gpu": 17244.84, "total_tokens": 38802748 }, { "epoch": 0.024568642160540135, "grad_norm": 1.1022372245788574, "learning_rate": 4.9030644152595376e-06, "loss": 1.1476, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 393, "tokens_per_second_per_gpu": 18134.18, "total_tokens": 38905605 }, { "epoch": 0.024631157789447362, "grad_norm": 1.1567150354385376, "learning_rate": 4.915572232645404e-06, "loss": 1.1392, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 394, "tokens_per_second_per_gpu": 18345.48, "total_tokens": 39008026 }, { "epoch": 0.02469367341835459, "grad_norm": 1.1553410291671753, "learning_rate": 4.92808005003127e-06, "loss": 1.1558, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 395, "tokens_per_second_per_gpu": 17079.2, "total_tokens": 39106686 }, { "epoch": 0.024756189047261814, "grad_norm": 1.4112738370895386, "learning_rate": 4.940587867417136e-06, "loss": 1.143, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 396, "tokens_per_second_per_gpu": 17240.64, "total_tokens": 39204641 }, { "epoch": 0.02481870467616904, "grad_norm": 1.1289515495300293, "learning_rate": 4.953095684803002e-06, "loss": 1.1042, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 397, "tokens_per_second_per_gpu": 17756.34, "total_tokens": 39306900 }, { "epoch": 0.02488122030507627, "grad_norm": 1.320482850074768, "learning_rate": 4.965603502188869e-06, "loss": 1.1324, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 398, "tokens_per_second_per_gpu": 16934.53, "total_tokens": 39401870 }, { "epoch": 0.024943735933983496, "grad_norm": 1.128136396408081, "learning_rate": 4.978111319574735e-06, "loss": 1.1128, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 399, "tokens_per_second_per_gpu": 18432.96, "total_tokens": 39505269 }, { "epoch": 0.025006251562890724, "grad_norm": 1.1487321853637695, "learning_rate": 4.990619136960601e-06, "loss": 1.1307, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 400, "tokens_per_second_per_gpu": 17921.56, "total_tokens": 39607202 }, { "epoch": 0.025068767191797948, "grad_norm": 1.1834073066711426, "learning_rate": 5.003126954346467e-06, "loss": 1.0982, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 401, "tokens_per_second_per_gpu": 17148.32, "total_tokens": 39705874 }, { "epoch": 0.025131282820705175, "grad_norm": 1.1825181245803833, "learning_rate": 5.015634771732333e-06, "loss": 1.1074, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 402, "tokens_per_second_per_gpu": 17204.3, "total_tokens": 39801541 }, { "epoch": 0.025193798449612403, "grad_norm": 1.1013628244400024, "learning_rate": 5.028142589118199e-06, "loss": 1.1233, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 403, "tokens_per_second_per_gpu": 17940.82, "total_tokens": 39901051 }, { "epoch": 0.02525631407851963, "grad_norm": 1.1751502752304077, "learning_rate": 5.040650406504065e-06, "loss": 1.1154, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 404, "tokens_per_second_per_gpu": 17278.34, "total_tokens": 39996758 }, { "epoch": 0.025318829707426858, "grad_norm": 1.190558910369873, "learning_rate": 5.053158223889931e-06, "loss": 1.1117, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 405, "tokens_per_second_per_gpu": 17327.21, "total_tokens": 40095562 }, { "epoch": 0.025381345336334085, "grad_norm": 1.2090810537338257, "learning_rate": 5.065666041275798e-06, "loss": 1.0904, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 406, "tokens_per_second_per_gpu": 18064.26, "total_tokens": 40193773 }, { "epoch": 0.02544386096524131, "grad_norm": 1.1830068826675415, "learning_rate": 5.078173858661664e-06, "loss": 1.1402, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 407, "tokens_per_second_per_gpu": 17560.05, "total_tokens": 40293565 }, { "epoch": 0.025506376594148537, "grad_norm": 1.140527606010437, "learning_rate": 5.09068167604753e-06, "loss": 1.0895, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 408, "tokens_per_second_per_gpu": 16679.9, "total_tokens": 40387077 }, { "epoch": 0.025568892223055764, "grad_norm": 1.1551176309585571, "learning_rate": 5.103189493433396e-06, "loss": 1.0539, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 409, "tokens_per_second_per_gpu": 17211.72, "total_tokens": 40484052 }, { "epoch": 0.02563140785196299, "grad_norm": 1.1141749620437622, "learning_rate": 5.115697310819262e-06, "loss": 1.0769, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 410, "tokens_per_second_per_gpu": 17557.45, "total_tokens": 40583010 }, { "epoch": 0.02569392348087022, "grad_norm": 1.1578737497329712, "learning_rate": 5.128205128205128e-06, "loss": 1.1704, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 411, "tokens_per_second_per_gpu": 18502.65, "total_tokens": 40688074 }, { "epoch": 0.025756439109777443, "grad_norm": 1.1162476539611816, "learning_rate": 5.140712945590994e-06, "loss": 1.0969, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 412, "tokens_per_second_per_gpu": 18846.96, "total_tokens": 40788693 }, { "epoch": 0.02581895473868467, "grad_norm": 1.0848313570022583, "learning_rate": 5.153220762976861e-06, "loss": 1.0757, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 413, "tokens_per_second_per_gpu": 17876.67, "total_tokens": 40890879 }, { "epoch": 0.025881470367591898, "grad_norm": 1.866152286529541, "learning_rate": 5.165728580362727e-06, "loss": 1.1726, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 414, "tokens_per_second_per_gpu": 17532.66, "total_tokens": 40991373 }, { "epoch": 0.025943985996499126, "grad_norm": 1.168462872505188, "learning_rate": 5.178236397748593e-06, "loss": 1.0606, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 415, "tokens_per_second_per_gpu": 16826.94, "total_tokens": 41086348 }, { "epoch": 0.026006501625406353, "grad_norm": 1.4217463731765747, "learning_rate": 5.190744215134459e-06, "loss": 1.132, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 416, "tokens_per_second_per_gpu": 18351.6, "total_tokens": 41187336 }, { "epoch": 0.026069017254313577, "grad_norm": 1.1724086999893188, "learning_rate": 5.203252032520326e-06, "loss": 1.1264, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 417, "tokens_per_second_per_gpu": 18106.21, "total_tokens": 41288352 }, { "epoch": 0.026131532883220805, "grad_norm": 1.0871012210845947, "learning_rate": 5.215759849906193e-06, "loss": 1.1212, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 418, "tokens_per_second_per_gpu": 18360.9, "total_tokens": 41392765 }, { "epoch": 0.026194048512128032, "grad_norm": 1.1003376245498657, "learning_rate": 5.228267667292059e-06, "loss": 1.0719, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 419, "tokens_per_second_per_gpu": 17858.5, "total_tokens": 41493422 }, { "epoch": 0.02625656414103526, "grad_norm": 1.1549146175384521, "learning_rate": 5.240775484677925e-06, "loss": 1.1392, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 420, "tokens_per_second_per_gpu": 16543.8, "total_tokens": 41593142 }, { "epoch": 0.026319079769942487, "grad_norm": 1.0894614458084106, "learning_rate": 5.253283302063791e-06, "loss": 1.0335, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 421, "tokens_per_second_per_gpu": 16208.77, "total_tokens": 41690372 }, { "epoch": 0.02638159539884971, "grad_norm": 1.1260274648666382, "learning_rate": 5.265791119449657e-06, "loss": 1.0844, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 422, "tokens_per_second_per_gpu": 18241.57, "total_tokens": 41790527 }, { "epoch": 0.02644411102775694, "grad_norm": 1.380476474761963, "learning_rate": 5.278298936835523e-06, "loss": 1.1072, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 423, "tokens_per_second_per_gpu": 17476.53, "total_tokens": 41888666 }, { "epoch": 0.026506626656664166, "grad_norm": 1.0675349235534668, "learning_rate": 5.290806754221389e-06, "loss": 1.0729, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 424, "tokens_per_second_per_gpu": 16673.25, "total_tokens": 41987253 }, { "epoch": 0.026569142285571393, "grad_norm": 1.23508620262146, "learning_rate": 5.303314571607255e-06, "loss": 1.1333, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 425, "tokens_per_second_per_gpu": 17594.7, "total_tokens": 42085892 }, { "epoch": 0.02663165791447862, "grad_norm": 1.2956326007843018, "learning_rate": 5.315822388993122e-06, "loss": 1.1262, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 426, "tokens_per_second_per_gpu": 16753.32, "total_tokens": 42184718 }, { "epoch": 0.026694173543385845, "grad_norm": 1.1060891151428223, "learning_rate": 5.328330206378988e-06, "loss": 1.1254, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 427, "tokens_per_second_per_gpu": 17917.91, "total_tokens": 42288558 }, { "epoch": 0.026756689172293072, "grad_norm": 1.0933600664138794, "learning_rate": 5.340838023764854e-06, "loss": 1.0633, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 428, "tokens_per_second_per_gpu": 17243.39, "total_tokens": 42388194 }, { "epoch": 0.0268192048012003, "grad_norm": 1.1794626712799072, "learning_rate": 5.35334584115072e-06, "loss": 1.1161, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 429, "tokens_per_second_per_gpu": 16863.45, "total_tokens": 42485774 }, { "epoch": 0.026881720430107527, "grad_norm": 1.1259328126907349, "learning_rate": 5.365853658536586e-06, "loss": 1.0876, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 430, "tokens_per_second_per_gpu": 17954.68, "total_tokens": 42587435 }, { "epoch": 0.026944236059014755, "grad_norm": 1.4116921424865723, "learning_rate": 5.378361475922452e-06, "loss": 1.1251, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 431, "tokens_per_second_per_gpu": 16852.55, "total_tokens": 42684899 }, { "epoch": 0.02700675168792198, "grad_norm": 1.166184663772583, "learning_rate": 5.390869293308318e-06, "loss": 1.0974, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 432, "tokens_per_second_per_gpu": 16665.68, "total_tokens": 42780344 }, { "epoch": 0.027069267316829206, "grad_norm": 1.1210639476776123, "learning_rate": 5.403377110694185e-06, "loss": 1.0412, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 433, "tokens_per_second_per_gpu": 18256.04, "total_tokens": 42881149 }, { "epoch": 0.027131782945736434, "grad_norm": 1.1293693780899048, "learning_rate": 5.415884928080051e-06, "loss": 1.097, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 434, "tokens_per_second_per_gpu": 18029.05, "total_tokens": 42978429 }, { "epoch": 0.02719429857464366, "grad_norm": 1.074796438217163, "learning_rate": 5.428392745465917e-06, "loss": 1.0614, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 435, "tokens_per_second_per_gpu": 16969.23, "total_tokens": 43078381 }, { "epoch": 0.02725681420355089, "grad_norm": 1.3238227367401123, "learning_rate": 5.440900562851783e-06, "loss": 1.0978, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 436, "tokens_per_second_per_gpu": 16634.06, "total_tokens": 43175523 }, { "epoch": 0.027319329832458116, "grad_norm": 1.1430975198745728, "learning_rate": 5.453408380237649e-06, "loss": 1.1309, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 437, "tokens_per_second_per_gpu": 18618.67, "total_tokens": 43276475 }, { "epoch": 0.02738184546136534, "grad_norm": 1.0817434787750244, "learning_rate": 5.465916197623515e-06, "loss": 1.0399, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 438, "tokens_per_second_per_gpu": 17860.42, "total_tokens": 43377247 }, { "epoch": 0.027444361090272568, "grad_norm": 1.1479411125183105, "learning_rate": 5.478424015009381e-06, "loss": 1.0803, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 439, "tokens_per_second_per_gpu": 16186.44, "total_tokens": 43474187 }, { "epoch": 0.027506876719179795, "grad_norm": 1.159415602684021, "learning_rate": 5.490931832395247e-06, "loss": 1.1328, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 440, "tokens_per_second_per_gpu": 17865.48, "total_tokens": 43576504 }, { "epoch": 0.027569392348087023, "grad_norm": 1.1234406232833862, "learning_rate": 5.503439649781114e-06, "loss": 1.0895, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 441, "tokens_per_second_per_gpu": 17855.02, "total_tokens": 43673586 }, { "epoch": 0.02763190797699425, "grad_norm": 1.1207685470581055, "learning_rate": 5.51594746716698e-06, "loss": 1.0316, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 442, "tokens_per_second_per_gpu": 16357.09, "total_tokens": 43767567 }, { "epoch": 0.027694423605901474, "grad_norm": 1.0950212478637695, "learning_rate": 5.528455284552846e-06, "loss": 1.0743, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 443, "tokens_per_second_per_gpu": 18333.74, "total_tokens": 43867221 }, { "epoch": 0.0277569392348087, "grad_norm": 1.3547327518463135, "learning_rate": 5.540963101938712e-06, "loss": 1.1142, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 444, "tokens_per_second_per_gpu": 17156.29, "total_tokens": 43968207 }, { "epoch": 0.02781945486371593, "grad_norm": 1.0888686180114746, "learning_rate": 5.553470919324578e-06, "loss": 1.0399, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 445, "tokens_per_second_per_gpu": 15889.27, "total_tokens": 44063305 }, { "epoch": 0.027881970492623157, "grad_norm": 1.0880528688430786, "learning_rate": 5.565978736710444e-06, "loss": 1.1147, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 446, "tokens_per_second_per_gpu": 17242.99, "total_tokens": 44160464 }, { "epoch": 0.027944486121530384, "grad_norm": 1.0839449167251587, "learning_rate": 5.57848655409631e-06, "loss": 1.0592, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 447, "tokens_per_second_per_gpu": 16132.41, "total_tokens": 44257993 }, { "epoch": 0.028007001750437608, "grad_norm": 1.1051559448242188, "learning_rate": 5.590994371482177e-06, "loss": 1.1188, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 448, "tokens_per_second_per_gpu": 18271.74, "total_tokens": 44359268 }, { "epoch": 0.028069517379344835, "grad_norm": 1.126473307609558, "learning_rate": 5.603502188868043e-06, "loss": 1.0821, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 449, "tokens_per_second_per_gpu": 16195.39, "total_tokens": 44454251 }, { "epoch": 0.028132033008252063, "grad_norm": 1.0547598600387573, "learning_rate": 5.616010006253909e-06, "loss": 1.112, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 450, "tokens_per_second_per_gpu": 19132.49, "total_tokens": 44561143 }, { "epoch": 0.02819454863715929, "grad_norm": 1.1422909498214722, "learning_rate": 5.628517823639775e-06, "loss": 1.0992, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 451, "tokens_per_second_per_gpu": 17570.66, "total_tokens": 44661536 }, { "epoch": 0.028257064266066518, "grad_norm": 1.0661721229553223, "learning_rate": 5.641025641025641e-06, "loss": 1.0908, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 452, "tokens_per_second_per_gpu": 17431.7, "total_tokens": 44762147 }, { "epoch": 0.028319579894973742, "grad_norm": 1.2301831245422363, "learning_rate": 5.6535334584115074e-06, "loss": 1.1128, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 453, "tokens_per_second_per_gpu": 16374.85, "total_tokens": 44858429 }, { "epoch": 0.02838209552388097, "grad_norm": 1.1378166675567627, "learning_rate": 5.6660412757973735e-06, "loss": 1.0976, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 454, "tokens_per_second_per_gpu": 18204.91, "total_tokens": 44957471 }, { "epoch": 0.028444611152788197, "grad_norm": 1.169412612915039, "learning_rate": 5.6785490931832395e-06, "loss": 1.0708, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 455, "tokens_per_second_per_gpu": 17311.88, "total_tokens": 45056167 }, { "epoch": 0.028507126781695424, "grad_norm": 1.1593307256698608, "learning_rate": 5.691056910569106e-06, "loss": 1.0236, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 456, "tokens_per_second_per_gpu": 17079.66, "total_tokens": 45151048 }, { "epoch": 0.028569642410602652, "grad_norm": 1.0998979806900024, "learning_rate": 5.7035647279549724e-06, "loss": 1.0786, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 457, "tokens_per_second_per_gpu": 18627.29, "total_tokens": 45253331 }, { "epoch": 0.028632158039509876, "grad_norm": 1.0808433294296265, "learning_rate": 5.7160725453408385e-06, "loss": 1.1312, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 458, "tokens_per_second_per_gpu": 18252.77, "total_tokens": 45354645 }, { "epoch": 0.028694673668417103, "grad_norm": 1.036213755607605, "learning_rate": 5.7285803627267045e-06, "loss": 1.0429, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 459, "tokens_per_second_per_gpu": 17019.71, "total_tokens": 45453056 }, { "epoch": 0.02875718929732433, "grad_norm": 1.0669145584106445, "learning_rate": 5.7410881801125705e-06, "loss": 1.13, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 460, "tokens_per_second_per_gpu": 17990.51, "total_tokens": 45555988 }, { "epoch": 0.02881970492623156, "grad_norm": 1.0969651937484741, "learning_rate": 5.753595997498437e-06, "loss": 1.0715, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 461, "tokens_per_second_per_gpu": 17175.73, "total_tokens": 45652190 }, { "epoch": 0.028882220555138786, "grad_norm": 1.1395790576934814, "learning_rate": 5.766103814884303e-06, "loss": 1.0594, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 462, "tokens_per_second_per_gpu": 16873.38, "total_tokens": 45747599 }, { "epoch": 0.028944736184046013, "grad_norm": 1.111646056175232, "learning_rate": 5.778611632270169e-06, "loss": 1.0992, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 463, "tokens_per_second_per_gpu": 18531.78, "total_tokens": 45849541 }, { "epoch": 0.029007251812953237, "grad_norm": 1.3682749271392822, "learning_rate": 5.7911194496560355e-06, "loss": 1.1328, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 464, "tokens_per_second_per_gpu": 16945.63, "total_tokens": 45948116 }, { "epoch": 0.029069767441860465, "grad_norm": 1.1388822793960571, "learning_rate": 5.803627267041902e-06, "loss": 1.0676, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 465, "tokens_per_second_per_gpu": 17749.88, "total_tokens": 46045006 }, { "epoch": 0.029132283070767692, "grad_norm": 1.095513105392456, "learning_rate": 5.816135084427768e-06, "loss": 1.0563, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 466, "tokens_per_second_per_gpu": 17715.79, "total_tokens": 46143681 }, { "epoch": 0.02919479869967492, "grad_norm": 1.04486083984375, "learning_rate": 5.828642901813634e-06, "loss": 1.0735, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 467, "tokens_per_second_per_gpu": 17901.8, "total_tokens": 46247160 }, { "epoch": 0.029257314328582147, "grad_norm": 1.1023368835449219, "learning_rate": 5.8411507191995e-06, "loss": 1.0531, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 468, "tokens_per_second_per_gpu": 17845.56, "total_tokens": 46349229 }, { "epoch": 0.02931982995748937, "grad_norm": 1.1176289319992065, "learning_rate": 5.853658536585366e-06, "loss": 1.1225, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 469, "tokens_per_second_per_gpu": 17129.71, "total_tokens": 46447713 }, { "epoch": 0.0293823455863966, "grad_norm": 1.1543041467666626, "learning_rate": 5.866166353971232e-06, "loss": 1.0597, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 470, "tokens_per_second_per_gpu": 17100.8, "total_tokens": 46548024 }, { "epoch": 0.029444861215303826, "grad_norm": 1.0232270956039429, "learning_rate": 5.878674171357099e-06, "loss": 0.9789, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 471, "tokens_per_second_per_gpu": 17372.85, "total_tokens": 46646243 }, { "epoch": 0.029507376844211054, "grad_norm": 1.075217366218567, "learning_rate": 5.891181988742965e-06, "loss": 1.0975, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 472, "tokens_per_second_per_gpu": 17834.96, "total_tokens": 46750171 }, { "epoch": 0.02956989247311828, "grad_norm": 1.1459437608718872, "learning_rate": 5.903689806128831e-06, "loss": 1.1136, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 473, "tokens_per_second_per_gpu": 17996.85, "total_tokens": 46850907 }, { "epoch": 0.029632408102025505, "grad_norm": 1.260699987411499, "learning_rate": 5.916197623514697e-06, "loss": 1.11, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 474, "tokens_per_second_per_gpu": 17949.5, "total_tokens": 46950891 }, { "epoch": 0.029694923730932733, "grad_norm": 1.0945090055465698, "learning_rate": 5.928705440900563e-06, "loss": 1.0514, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 475, "tokens_per_second_per_gpu": 17551.64, "total_tokens": 47052118 }, { "epoch": 0.02975743935983996, "grad_norm": 1.1680456399917603, "learning_rate": 5.941213258286429e-06, "loss": 1.0647, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 476, "tokens_per_second_per_gpu": 16194.07, "total_tokens": 47147589 }, { "epoch": 0.029819954988747188, "grad_norm": 1.057973861694336, "learning_rate": 5.953721075672295e-06, "loss": 1.0817, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 477, "tokens_per_second_per_gpu": 16393.92, "total_tokens": 47246991 }, { "epoch": 0.029882470617654415, "grad_norm": 1.0720748901367188, "learning_rate": 5.966228893058161e-06, "loss": 1.056, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 478, "tokens_per_second_per_gpu": 18808.7, "total_tokens": 47350390 }, { "epoch": 0.02994498624656164, "grad_norm": 1.0874660015106201, "learning_rate": 5.978736710444028e-06, "loss": 1.1159, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 479, "tokens_per_second_per_gpu": 17646.81, "total_tokens": 47451802 }, { "epoch": 0.030007501875468866, "grad_norm": 1.0841914415359497, "learning_rate": 5.991244527829894e-06, "loss": 1.0914, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 480, "tokens_per_second_per_gpu": 18961.94, "total_tokens": 47554295 }, { "epoch": 0.030070017504376094, "grad_norm": 1.0789419412612915, "learning_rate": 6.003752345215761e-06, "loss": 1.1088, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 481, "tokens_per_second_per_gpu": 19007.76, "total_tokens": 47658405 }, { "epoch": 0.03013253313328332, "grad_norm": 1.186642050743103, "learning_rate": 6.016260162601627e-06, "loss": 1.0056, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 482, "tokens_per_second_per_gpu": 14529.27, "total_tokens": 47746629 }, { "epoch": 0.03019504876219055, "grad_norm": 1.1088948249816895, "learning_rate": 6.028767979987493e-06, "loss": 1.152, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 483, "tokens_per_second_per_gpu": 17870.35, "total_tokens": 47850324 }, { "epoch": 0.030257564391097773, "grad_norm": 1.0494903326034546, "learning_rate": 6.04127579737336e-06, "loss": 1.0789, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 484, "tokens_per_second_per_gpu": 18862.01, "total_tokens": 47954952 }, { "epoch": 0.030320080020005, "grad_norm": 1.9632081985473633, "learning_rate": 6.053783614759226e-06, "loss": 1.0696, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 485, "tokens_per_second_per_gpu": 17649.42, "total_tokens": 48052330 }, { "epoch": 0.030382595648912228, "grad_norm": 1.0876951217651367, "learning_rate": 6.066291432145092e-06, "loss": 1.0228, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 486, "tokens_per_second_per_gpu": 16662.96, "total_tokens": 48148147 }, { "epoch": 0.030445111277819455, "grad_norm": 1.117795705795288, "learning_rate": 6.078799249530958e-06, "loss": 1.058, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 487, "tokens_per_second_per_gpu": 17136.38, "total_tokens": 48245338 }, { "epoch": 0.030507626906726683, "grad_norm": 1.3107823133468628, "learning_rate": 6.091307066916824e-06, "loss": 1.0955, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 488, "tokens_per_second_per_gpu": 17089.08, "total_tokens": 48338479 }, { "epoch": 0.030570142535633907, "grad_norm": 1.7017078399658203, "learning_rate": 6.10381488430269e-06, "loss": 1.0486, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 489, "tokens_per_second_per_gpu": 16057.96, "total_tokens": 48428187 }, { "epoch": 0.030632658164541134, "grad_norm": 1.1701784133911133, "learning_rate": 6.116322701688556e-06, "loss": 1.0922, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 490, "tokens_per_second_per_gpu": 17385.21, "total_tokens": 48528994 }, { "epoch": 0.030695173793448362, "grad_norm": 1.0953755378723145, "learning_rate": 6.128830519074423e-06, "loss": 1.053, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 491, "tokens_per_second_per_gpu": 17718.0, "total_tokens": 48630483 }, { "epoch": 0.03075768942235559, "grad_norm": 1.220496416091919, "learning_rate": 6.141338336460289e-06, "loss": 1.0511, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 492, "tokens_per_second_per_gpu": 16769.85, "total_tokens": 48728552 }, { "epoch": 0.030820205051262817, "grad_norm": 1.0470138788223267, "learning_rate": 6.153846153846155e-06, "loss": 0.9626, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 493, "tokens_per_second_per_gpu": 16084.96, "total_tokens": 48821316 }, { "epoch": 0.030882720680170044, "grad_norm": 1.209664225578308, "learning_rate": 6.166353971232021e-06, "loss": 1.0074, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 494, "tokens_per_second_per_gpu": 16115.79, "total_tokens": 48916461 }, { "epoch": 0.030945236309077268, "grad_norm": 1.106646180152893, "learning_rate": 6.178861788617887e-06, "loss": 1.0608, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 495, "tokens_per_second_per_gpu": 16665.28, "total_tokens": 49014027 }, { "epoch": 0.031007751937984496, "grad_norm": 1.4720795154571533, "learning_rate": 6.191369606003753e-06, "loss": 1.1056, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 496, "tokens_per_second_per_gpu": 17391.32, "total_tokens": 49115358 }, { "epoch": 0.031070267566891723, "grad_norm": 1.1911513805389404, "learning_rate": 6.203877423389619e-06, "loss": 1.0947, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 497, "tokens_per_second_per_gpu": 16807.96, "total_tokens": 49211046 }, { "epoch": 0.03113278319579895, "grad_norm": 1.1658263206481934, "learning_rate": 6.216385240775485e-06, "loss": 1.077, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 498, "tokens_per_second_per_gpu": 17023.16, "total_tokens": 49309267 }, { "epoch": 0.031195298824706178, "grad_norm": 1.1404635906219482, "learning_rate": 6.228893058161352e-06, "loss": 1.0721, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 499, "tokens_per_second_per_gpu": 17649.46, "total_tokens": 49406360 }, { "epoch": 0.031257814453613406, "grad_norm": 2.001394748687744, "learning_rate": 6.241400875547218e-06, "loss": 1.0237, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 500, "tokens_per_second_per_gpu": 18424.33, "total_tokens": 49507900 }, { "epoch": 0.03132033008252063, "grad_norm": 1.1404621601104736, "learning_rate": 6.253908692933084e-06, "loss": 1.0544, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 501, "tokens_per_second_per_gpu": 17863.15, "total_tokens": 49609687 }, { "epoch": 0.031382845711427854, "grad_norm": 1.1328120231628418, "learning_rate": 6.26641651031895e-06, "loss": 1.0982, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 502, "tokens_per_second_per_gpu": 17283.96, "total_tokens": 49709480 }, { "epoch": 0.031445361340335085, "grad_norm": 1.118098497390747, "learning_rate": 6.278924327704816e-06, "loss": 1.029, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 503, "tokens_per_second_per_gpu": 17141.79, "total_tokens": 49801359 }, { "epoch": 0.03150787696924231, "grad_norm": 1.1152527332305908, "learning_rate": 6.291432145090682e-06, "loss": 1.053, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 504, "tokens_per_second_per_gpu": 17521.73, "total_tokens": 49898109 }, { "epoch": 0.03157039259814954, "grad_norm": 1.3249907493591309, "learning_rate": 6.303939962476548e-06, "loss": 1.0588, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 505, "tokens_per_second_per_gpu": 16401.78, "total_tokens": 49993390 }, { "epoch": 0.031632908227056764, "grad_norm": 1.1577504873275757, "learning_rate": 6.316447779862415e-06, "loss": 1.0349, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 506, "tokens_per_second_per_gpu": 16801.42, "total_tokens": 50090595 }, { "epoch": 0.03169542385596399, "grad_norm": 1.1235395669937134, "learning_rate": 6.328955597248281e-06, "loss": 1.0828, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 507, "tokens_per_second_per_gpu": 17819.04, "total_tokens": 50190409 }, { "epoch": 0.03175793948487122, "grad_norm": 1.0707616806030273, "learning_rate": 6.341463414634147e-06, "loss": 1.0667, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 508, "tokens_per_second_per_gpu": 17629.87, "total_tokens": 50289929 }, { "epoch": 0.03182045511377844, "grad_norm": 1.0638599395751953, "learning_rate": 6.353971232020013e-06, "loss": 1.0728, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 509, "tokens_per_second_per_gpu": 17631.16, "total_tokens": 50392027 }, { "epoch": 0.03188297074268567, "grad_norm": 1.089614748954773, "learning_rate": 6.366479049405879e-06, "loss": 1.11, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 510, "tokens_per_second_per_gpu": 17846.57, "total_tokens": 50494172 }, { "epoch": 0.0319454863715929, "grad_norm": 1.1235766410827637, "learning_rate": 6.378986866791745e-06, "loss": 1.0729, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 511, "tokens_per_second_per_gpu": 17741.48, "total_tokens": 50595431 }, { "epoch": 0.03200800200050013, "grad_norm": 1.2737404108047485, "learning_rate": 6.391494684177611e-06, "loss": 1.0053, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 512, "tokens_per_second_per_gpu": 17482.32, "total_tokens": 50688751 }, { "epoch": 0.03207051762940735, "grad_norm": 1.0552994012832642, "learning_rate": 6.404002501563477e-06, "loss": 0.9957, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 513, "tokens_per_second_per_gpu": 17884.83, "total_tokens": 50788866 }, { "epoch": 0.032133033258314576, "grad_norm": 1.1688508987426758, "learning_rate": 6.416510318949344e-06, "loss": 1.1102, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 514, "tokens_per_second_per_gpu": 17767.45, "total_tokens": 50886417 }, { "epoch": 0.03219554888722181, "grad_norm": 1.1169583797454834, "learning_rate": 6.42901813633521e-06, "loss": 1.0324, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 515, "tokens_per_second_per_gpu": 18245.02, "total_tokens": 50983561 }, { "epoch": 0.03225806451612903, "grad_norm": 1.1165872812271118, "learning_rate": 6.441525953721076e-06, "loss": 1.08, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 516, "tokens_per_second_per_gpu": 16878.15, "total_tokens": 51080141 }, { "epoch": 0.03232058014503626, "grad_norm": 1.1104741096496582, "learning_rate": 6.454033771106942e-06, "loss": 1.0613, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 517, "tokens_per_second_per_gpu": 17089.12, "total_tokens": 51176400 }, { "epoch": 0.032383095773943486, "grad_norm": 1.0512090921401978, "learning_rate": 6.466541588492808e-06, "loss": 1.0476, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 518, "tokens_per_second_per_gpu": 17141.57, "total_tokens": 51277386 }, { "epoch": 0.03244561140285071, "grad_norm": 1.1209920644760132, "learning_rate": 6.479049405878674e-06, "loss": 1.0689, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 519, "tokens_per_second_per_gpu": 16097.76, "total_tokens": 51372287 }, { "epoch": 0.03250812703175794, "grad_norm": 1.0807552337646484, "learning_rate": 6.4915572232645404e-06, "loss": 1.0629, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 520, "tokens_per_second_per_gpu": 18292.54, "total_tokens": 51475554 }, { "epoch": 0.032570642660665165, "grad_norm": 1.2037302255630493, "learning_rate": 6.504065040650407e-06, "loss": 1.0883, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 521, "tokens_per_second_per_gpu": 17325.85, "total_tokens": 51576678 }, { "epoch": 0.032633158289572396, "grad_norm": 1.1010154485702515, "learning_rate": 6.516572858036273e-06, "loss": 1.0803, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 522, "tokens_per_second_per_gpu": 17583.61, "total_tokens": 51677503 }, { "epoch": 0.03269567391847962, "grad_norm": 1.049636960029602, "learning_rate": 6.529080675422139e-06, "loss": 0.9972, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 523, "tokens_per_second_per_gpu": 17971.65, "total_tokens": 51776999 }, { "epoch": 0.032758189547386844, "grad_norm": 1.111151933670044, "learning_rate": 6.5415884928080054e-06, "loss": 1.0306, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 524, "tokens_per_second_per_gpu": 17582.67, "total_tokens": 51877421 }, { "epoch": 0.032820705176294075, "grad_norm": 1.053775668144226, "learning_rate": 6.5540963101938715e-06, "loss": 1.0433, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 525, "tokens_per_second_per_gpu": 16233.83, "total_tokens": 51973256 }, { "epoch": 0.0328832208052013, "grad_norm": 1.1313804388046265, "learning_rate": 6.5666041275797375e-06, "loss": 1.0736, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 526, "tokens_per_second_per_gpu": 17036.14, "total_tokens": 52070916 }, { "epoch": 0.03294573643410853, "grad_norm": 1.0182712078094482, "learning_rate": 6.5791119449656035e-06, "loss": 1.0165, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 527, "tokens_per_second_per_gpu": 17299.5, "total_tokens": 52167957 }, { "epoch": 0.033008252063015754, "grad_norm": 1.0585274696350098, "learning_rate": 6.5916197623514696e-06, "loss": 1.0118, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 528, "tokens_per_second_per_gpu": 15719.18, "total_tokens": 52263301 }, { "epoch": 0.03307076769192298, "grad_norm": 1.3586136102676392, "learning_rate": 6.6041275797373365e-06, "loss": 1.0615, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 529, "tokens_per_second_per_gpu": 17076.09, "total_tokens": 52358691 }, { "epoch": 0.03313328332083021, "grad_norm": 1.1716058254241943, "learning_rate": 6.6166353971232025e-06, "loss": 1.1038, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 530, "tokens_per_second_per_gpu": 16791.25, "total_tokens": 52456340 }, { "epoch": 0.03319579894973743, "grad_norm": 1.0633119344711304, "learning_rate": 6.6291432145090685e-06, "loss": 1.0045, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 531, "tokens_per_second_per_gpu": 17342.45, "total_tokens": 52553764 }, { "epoch": 0.033258314578644664, "grad_norm": 1.1344215869903564, "learning_rate": 6.6416510318949346e-06, "loss": 1.0652, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 532, "tokens_per_second_per_gpu": 17154.01, "total_tokens": 52653407 }, { "epoch": 0.03332083020755189, "grad_norm": 1.156873106956482, "learning_rate": 6.654158849280801e-06, "loss": 1.0962, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 533, "tokens_per_second_per_gpu": 17197.46, "total_tokens": 52754179 }, { "epoch": 0.03338334583645911, "grad_norm": 1.147726058959961, "learning_rate": 6.666666666666667e-06, "loss": 1.0982, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 534, "tokens_per_second_per_gpu": 18336.58, "total_tokens": 52858831 }, { "epoch": 0.03344586146536634, "grad_norm": 1.0439294576644897, "learning_rate": 6.679174484052533e-06, "loss": 1.0299, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 535, "tokens_per_second_per_gpu": 17961.04, "total_tokens": 52962493 }, { "epoch": 0.03350837709427357, "grad_norm": 1.0282052755355835, "learning_rate": 6.691682301438399e-06, "loss": 0.9995, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 536, "tokens_per_second_per_gpu": 17389.38, "total_tokens": 53061322 }, { "epoch": 0.0335708927231808, "grad_norm": 1.084859848022461, "learning_rate": 6.704190118824266e-06, "loss": 1.0299, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 537, "tokens_per_second_per_gpu": 17899.56, "total_tokens": 53159256 }, { "epoch": 0.03363340835208802, "grad_norm": 1.1118218898773193, "learning_rate": 6.716697936210132e-06, "loss": 1.0986, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 538, "tokens_per_second_per_gpu": 16590.93, "total_tokens": 53255416 }, { "epoch": 0.033695923980995246, "grad_norm": 1.0562024116516113, "learning_rate": 6.729205753595998e-06, "loss": 1.0551, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 539, "tokens_per_second_per_gpu": 16948.73, "total_tokens": 53354455 }, { "epoch": 0.03375843960990248, "grad_norm": 1.0692859888076782, "learning_rate": 6.741713570981864e-06, "loss": 1.0762, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 540, "tokens_per_second_per_gpu": 17550.7, "total_tokens": 53455860 }, { "epoch": 0.0338209552388097, "grad_norm": 1.1429063081741333, "learning_rate": 6.75422138836773e-06, "loss": 1.0572, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 541, "tokens_per_second_per_gpu": 15876.01, "total_tokens": 53553611 }, { "epoch": 0.03388347086771693, "grad_norm": 1.0898284912109375, "learning_rate": 6.766729205753596e-06, "loss": 1.0889, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 542, "tokens_per_second_per_gpu": 18062.5, "total_tokens": 53656588 }, { "epoch": 0.033945986496624156, "grad_norm": 1.0742151737213135, "learning_rate": 6.779237023139462e-06, "loss": 1.0659, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 543, "tokens_per_second_per_gpu": 17181.94, "total_tokens": 53754209 }, { "epoch": 0.03400850212553138, "grad_norm": 1.1417852640151978, "learning_rate": 6.791744840525329e-06, "loss": 1.0941, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 544, "tokens_per_second_per_gpu": 17948.75, "total_tokens": 53859542 }, { "epoch": 0.03407101775443861, "grad_norm": 1.0499979257583618, "learning_rate": 6.804252657911196e-06, "loss": 1.0413, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 545, "tokens_per_second_per_gpu": 17465.7, "total_tokens": 53960234 }, { "epoch": 0.034133533383345835, "grad_norm": 1.1164790391921997, "learning_rate": 6.816760475297062e-06, "loss": 1.0143, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 546, "tokens_per_second_per_gpu": 17174.91, "total_tokens": 54059934 }, { "epoch": 0.034196049012253066, "grad_norm": 1.021003246307373, "learning_rate": 6.829268292682928e-06, "loss": 1.0201, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 547, "tokens_per_second_per_gpu": 18029.83, "total_tokens": 54161278 }, { "epoch": 0.03425856464116029, "grad_norm": 1.0299428701400757, "learning_rate": 6.841776110068794e-06, "loss": 1.0112, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 548, "tokens_per_second_per_gpu": 16983.13, "total_tokens": 54259152 }, { "epoch": 0.034321080270067514, "grad_norm": 1.0530954599380493, "learning_rate": 6.854283927454661e-06, "loss": 1.0487, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 549, "tokens_per_second_per_gpu": 17512.37, "total_tokens": 54358176 }, { "epoch": 0.034383595898974745, "grad_norm": 1.2138445377349854, "learning_rate": 6.866791744840527e-06, "loss": 1.0111, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 550, "tokens_per_second_per_gpu": 16529.43, "total_tokens": 54454675 }, { "epoch": 0.03444611152788197, "grad_norm": 1.160725474357605, "learning_rate": 6.879299562226393e-06, "loss": 1.0242, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 551, "tokens_per_second_per_gpu": 16924.09, "total_tokens": 54551168 }, { "epoch": 0.0345086271567892, "grad_norm": 1.05306077003479, "learning_rate": 6.891807379612259e-06, "loss": 1.0463, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 552, "tokens_per_second_per_gpu": 18376.85, "total_tokens": 54654274 }, { "epoch": 0.034571142785696424, "grad_norm": 1.1761448383331299, "learning_rate": 6.904315196998125e-06, "loss": 1.0464, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 553, "tokens_per_second_per_gpu": 16686.61, "total_tokens": 54750895 }, { "epoch": 0.03463365841460365, "grad_norm": 1.0608294010162354, "learning_rate": 6.916823014383991e-06, "loss": 1.0078, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 554, "tokens_per_second_per_gpu": 16995.09, "total_tokens": 54846964 }, { "epoch": 0.03469617404351088, "grad_norm": 1.0874844789505005, "learning_rate": 6.929330831769857e-06, "loss": 1.0546, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 555, "tokens_per_second_per_gpu": 15526.19, "total_tokens": 54942993 }, { "epoch": 0.0347586896724181, "grad_norm": 1.0430949926376343, "learning_rate": 6.941838649155723e-06, "loss": 1.0457, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 556, "tokens_per_second_per_gpu": 17432.63, "total_tokens": 55043333 }, { "epoch": 0.034821205301325334, "grad_norm": 1.2068184614181519, "learning_rate": 6.95434646654159e-06, "loss": 1.0855, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 557, "tokens_per_second_per_gpu": 17257.2, "total_tokens": 55145057 }, { "epoch": 0.03488372093023256, "grad_norm": 1.0935083627700806, "learning_rate": 6.966854283927456e-06, "loss": 1.0181, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 558, "tokens_per_second_per_gpu": 16865.98, "total_tokens": 55239796 }, { "epoch": 0.03494623655913978, "grad_norm": 1.0839476585388184, "learning_rate": 6.979362101313322e-06, "loss": 1.0497, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 559, "tokens_per_second_per_gpu": 17247.27, "total_tokens": 55338220 }, { "epoch": 0.03500875218804701, "grad_norm": 1.1263092756271362, "learning_rate": 6.991869918699188e-06, "loss": 1.029, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 560, "tokens_per_second_per_gpu": 16764.86, "total_tokens": 55436933 }, { "epoch": 0.03507126781695424, "grad_norm": 1.046097993850708, "learning_rate": 7.004377736085054e-06, "loss": 1.0325, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 561, "tokens_per_second_per_gpu": 17476.31, "total_tokens": 55539122 }, { "epoch": 0.03513378344586147, "grad_norm": 1.084578037261963, "learning_rate": 7.01688555347092e-06, "loss": 1.0164, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 562, "tokens_per_second_per_gpu": 15393.84, "total_tokens": 55631422 }, { "epoch": 0.03519629907476869, "grad_norm": 1.0731617212295532, "learning_rate": 7.029393370856786e-06, "loss": 1.0441, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 563, "tokens_per_second_per_gpu": 18656.77, "total_tokens": 55731689 }, { "epoch": 0.035258814703675916, "grad_norm": 1.1027075052261353, "learning_rate": 7.041901188242653e-06, "loss": 1.0504, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 564, "tokens_per_second_per_gpu": 17540.99, "total_tokens": 55831066 }, { "epoch": 0.035321330332583147, "grad_norm": 1.0640535354614258, "learning_rate": 7.054409005628519e-06, "loss": 1.0701, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 565, "tokens_per_second_per_gpu": 18310.52, "total_tokens": 55930465 }, { "epoch": 0.03538384596149037, "grad_norm": 1.1233407258987427, "learning_rate": 7.066916823014385e-06, "loss": 1.0641, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 566, "tokens_per_second_per_gpu": 17201.79, "total_tokens": 56033293 }, { "epoch": 0.0354463615903976, "grad_norm": 1.0252200365066528, "learning_rate": 7.079424640400251e-06, "loss": 1.0326, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 567, "tokens_per_second_per_gpu": 17066.0, "total_tokens": 56131252 }, { "epoch": 0.035508877219304825, "grad_norm": 1.1106477975845337, "learning_rate": 7.091932457786117e-06, "loss": 1.0953, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 568, "tokens_per_second_per_gpu": 17196.32, "total_tokens": 56231370 }, { "epoch": 0.035571392848212056, "grad_norm": 1.1648356914520264, "learning_rate": 7.104440275171983e-06, "loss": 1.0533, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 569, "tokens_per_second_per_gpu": 16697.14, "total_tokens": 56328674 }, { "epoch": 0.03563390847711928, "grad_norm": 1.0212934017181396, "learning_rate": 7.116948092557849e-06, "loss": 0.9728, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 570, "tokens_per_second_per_gpu": 17706.04, "total_tokens": 56429136 }, { "epoch": 0.035696424106026504, "grad_norm": 1.0284733772277832, "learning_rate": 7.129455909943715e-06, "loss": 1.0736, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 571, "tokens_per_second_per_gpu": 17923.92, "total_tokens": 56534630 }, { "epoch": 0.035758939734933735, "grad_norm": 1.031531810760498, "learning_rate": 7.141963727329582e-06, "loss": 0.9489, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 572, "tokens_per_second_per_gpu": 15731.7, "total_tokens": 56631283 }, { "epoch": 0.03582145536384096, "grad_norm": 1.1559412479400635, "learning_rate": 7.154471544715448e-06, "loss": 1.033, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 573, "tokens_per_second_per_gpu": 15257.22, "total_tokens": 56724400 }, { "epoch": 0.03588397099274819, "grad_norm": 1.0123579502105713, "learning_rate": 7.166979362101314e-06, "loss": 1.0033, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 574, "tokens_per_second_per_gpu": 17604.14, "total_tokens": 56824195 }, { "epoch": 0.035946486621655414, "grad_norm": 1.044584035873413, "learning_rate": 7.17948717948718e-06, "loss": 1.0532, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 575, "tokens_per_second_per_gpu": 18051.25, "total_tokens": 56928073 }, { "epoch": 0.03600900225056264, "grad_norm": 1.0175098180770874, "learning_rate": 7.191994996873046e-06, "loss": 0.9909, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 576, "tokens_per_second_per_gpu": 17654.12, "total_tokens": 57027078 }, { "epoch": 0.03607151787946987, "grad_norm": 1.0853376388549805, "learning_rate": 7.204502814258912e-06, "loss": 1.0267, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 577, "tokens_per_second_per_gpu": 18093.3, "total_tokens": 57125853 }, { "epoch": 0.03613403350837709, "grad_norm": 1.1698778867721558, "learning_rate": 7.217010631644778e-06, "loss": 1.0141, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 578, "tokens_per_second_per_gpu": 16951.22, "total_tokens": 57224078 }, { "epoch": 0.036196549137284324, "grad_norm": 1.125531792640686, "learning_rate": 7.229518449030645e-06, "loss": 1.0381, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 579, "tokens_per_second_per_gpu": 17506.42, "total_tokens": 57320702 }, { "epoch": 0.03625906476619155, "grad_norm": 1.0789942741394043, "learning_rate": 7.242026266416511e-06, "loss": 1.0704, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 580, "tokens_per_second_per_gpu": 17792.87, "total_tokens": 57422084 }, { "epoch": 0.03632158039509877, "grad_norm": 1.107340931892395, "learning_rate": 7.254534083802377e-06, "loss": 1.0887, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 581, "tokens_per_second_per_gpu": 17251.16, "total_tokens": 57523822 }, { "epoch": 0.036384096024006, "grad_norm": 1.111557126045227, "learning_rate": 7.267041901188243e-06, "loss": 1.108, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 582, "tokens_per_second_per_gpu": 16744.11, "total_tokens": 57621722 }, { "epoch": 0.03644661165291323, "grad_norm": 1.0797030925750732, "learning_rate": 7.279549718574109e-06, "loss": 1.0385, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 583, "tokens_per_second_per_gpu": 16569.34, "total_tokens": 57719440 }, { "epoch": 0.03650912728182046, "grad_norm": 1.183166265487671, "learning_rate": 7.292057535959975e-06, "loss": 1.0226, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 584, "tokens_per_second_per_gpu": 17919.5, "total_tokens": 57820652 }, { "epoch": 0.03657164291072768, "grad_norm": 1.024739384651184, "learning_rate": 7.304565353345841e-06, "loss": 1.0694, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 585, "tokens_per_second_per_gpu": 17380.49, "total_tokens": 57922951 }, { "epoch": 0.036634158539634906, "grad_norm": 1.0482553243637085, "learning_rate": 7.317073170731707e-06, "loss": 1.0238, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 586, "tokens_per_second_per_gpu": 18813.35, "total_tokens": 58025289 }, { "epoch": 0.03669667416854214, "grad_norm": 1.0778489112854004, "learning_rate": 7.329580988117574e-06, "loss": 1.0725, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 587, "tokens_per_second_per_gpu": 17374.87, "total_tokens": 58127162 }, { "epoch": 0.03675918979744936, "grad_norm": 1.0098536014556885, "learning_rate": 7.34208880550344e-06, "loss": 0.9926, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 588, "tokens_per_second_per_gpu": 17153.35, "total_tokens": 58225947 }, { "epoch": 0.03682170542635659, "grad_norm": 1.0541441440582275, "learning_rate": 7.354596622889306e-06, "loss": 1.0536, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 589, "tokens_per_second_per_gpu": 17882.01, "total_tokens": 58326531 }, { "epoch": 0.036884221055263816, "grad_norm": 1.0590852499008179, "learning_rate": 7.367104440275172e-06, "loss": 1.047, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 590, "tokens_per_second_per_gpu": 17178.91, "total_tokens": 58426856 }, { "epoch": 0.03694673668417104, "grad_norm": 1.0637949705123901, "learning_rate": 7.379612257661038e-06, "loss": 1.0099, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 591, "tokens_per_second_per_gpu": 17380.38, "total_tokens": 58525216 }, { "epoch": 0.03700925231307827, "grad_norm": 1.0737063884735107, "learning_rate": 7.3921200750469045e-06, "loss": 1.0726, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 592, "tokens_per_second_per_gpu": 16747.39, "total_tokens": 58626188 }, { "epoch": 0.037071767941985495, "grad_norm": 1.109482765197754, "learning_rate": 7.4046278924327705e-06, "loss": 1.0789, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 593, "tokens_per_second_per_gpu": 18063.92, "total_tokens": 58726974 }, { "epoch": 0.037134283570892726, "grad_norm": 1.0172772407531738, "learning_rate": 7.417135709818637e-06, "loss": 0.9459, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 594, "tokens_per_second_per_gpu": 15829.54, "total_tokens": 58821487 }, { "epoch": 0.03719679919979995, "grad_norm": 1.095457911491394, "learning_rate": 7.429643527204503e-06, "loss": 1.035, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 595, "tokens_per_second_per_gpu": 16090.5, "total_tokens": 58914096 }, { "epoch": 0.037259314828707174, "grad_norm": 1.0873768329620361, "learning_rate": 7.4421513445903694e-06, "loss": 1.0245, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 596, "tokens_per_second_per_gpu": 16017.71, "total_tokens": 59006387 }, { "epoch": 0.037321830457614405, "grad_norm": 1.046357274055481, "learning_rate": 7.4546591619762355e-06, "loss": 1.0421, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 597, "tokens_per_second_per_gpu": 17828.87, "total_tokens": 59104672 }, { "epoch": 0.03738434608652163, "grad_norm": 1.0038423538208008, "learning_rate": 7.4671669793621015e-06, "loss": 1.0185, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 598, "tokens_per_second_per_gpu": 18390.11, "total_tokens": 59205935 }, { "epoch": 0.03744686171542886, "grad_norm": 1.069287657737732, "learning_rate": 7.4796747967479676e-06, "loss": 1.0142, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 599, "tokens_per_second_per_gpu": 17756.64, "total_tokens": 59301555 }, { "epoch": 0.037509377344336084, "grad_norm": 1.1375997066497803, "learning_rate": 7.492182614133834e-06, "loss": 1.0368, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 600, "tokens_per_second_per_gpu": 16813.73, "total_tokens": 59401219 }, { "epoch": 0.03757189297324331, "grad_norm": 1.047926664352417, "learning_rate": 7.5046904315197e-06, "loss": 0.9955, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 601, "tokens_per_second_per_gpu": 17252.01, "total_tokens": 59501164 }, { "epoch": 0.03763440860215054, "grad_norm": 1.1632094383239746, "learning_rate": 7.5171982489055665e-06, "loss": 0.9635, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 602, "tokens_per_second_per_gpu": 16424.58, "total_tokens": 59597430 }, { "epoch": 0.03769692423105776, "grad_norm": 1.0753744840621948, "learning_rate": 7.5297060662914326e-06, "loss": 1.0145, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 603, "tokens_per_second_per_gpu": 17414.13, "total_tokens": 59693550 }, { "epoch": 0.037759439859964994, "grad_norm": 1.1921579837799072, "learning_rate": 7.542213883677299e-06, "loss": 1.0538, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 604, "tokens_per_second_per_gpu": 17629.92, "total_tokens": 59789318 }, { "epoch": 0.03782195548887222, "grad_norm": 1.0842394828796387, "learning_rate": 7.554721701063165e-06, "loss": 1.0437, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 605, "tokens_per_second_per_gpu": 17491.42, "total_tokens": 59885630 }, { "epoch": 0.03788447111777944, "grad_norm": 1.088498830795288, "learning_rate": 7.567229518449031e-06, "loss": 0.9926, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 606, "tokens_per_second_per_gpu": 17030.94, "total_tokens": 59982366 }, { "epoch": 0.03794698674668667, "grad_norm": 1.0762128829956055, "learning_rate": 7.579737335834897e-06, "loss": 1.0236, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 607, "tokens_per_second_per_gpu": 17555.47, "total_tokens": 60083512 }, { "epoch": 0.0380095023755939, "grad_norm": 1.181146502494812, "learning_rate": 7.592245153220763e-06, "loss": 1.0907, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 608, "tokens_per_second_per_gpu": 18180.44, "total_tokens": 60185520 }, { "epoch": 0.03807201800450113, "grad_norm": 1.04274320602417, "learning_rate": 7.6047529706066305e-06, "loss": 1.0062, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 609, "tokens_per_second_per_gpu": 16783.62, "total_tokens": 60282225 }, { "epoch": 0.03813453363340835, "grad_norm": 1.0638254880905151, "learning_rate": 7.6172607879924965e-06, "loss": 0.9346, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 610, "tokens_per_second_per_gpu": 14657.95, "total_tokens": 60375465 }, { "epoch": 0.038197049262315576, "grad_norm": 1.0938230752944946, "learning_rate": 7.629768605378363e-06, "loss": 1.0735, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 611, "tokens_per_second_per_gpu": 17589.33, "total_tokens": 60474658 }, { "epoch": 0.03825956489122281, "grad_norm": 1.0141123533248901, "learning_rate": 7.64227642276423e-06, "loss": 0.997, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 612, "tokens_per_second_per_gpu": 17715.01, "total_tokens": 60573392 }, { "epoch": 0.03832208052013003, "grad_norm": 1.0626444816589355, "learning_rate": 7.654784240150095e-06, "loss": 0.9641, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 613, "tokens_per_second_per_gpu": 16724.71, "total_tokens": 60668393 }, { "epoch": 0.03838459614903726, "grad_norm": 1.074157953262329, "learning_rate": 7.667292057535962e-06, "loss": 1.0598, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 614, "tokens_per_second_per_gpu": 16905.12, "total_tokens": 60769164 }, { "epoch": 0.038447111777944486, "grad_norm": 1.0444896221160889, "learning_rate": 7.679799874921827e-06, "loss": 1.0187, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 615, "tokens_per_second_per_gpu": 18033.59, "total_tokens": 60867504 }, { "epoch": 0.03850962740685171, "grad_norm": 1.1583240032196045, "learning_rate": 7.692307692307694e-06, "loss": 1.0197, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 616, "tokens_per_second_per_gpu": 18177.39, "total_tokens": 60963765 }, { "epoch": 0.03857214303575894, "grad_norm": 0.989948034286499, "learning_rate": 7.704815509693559e-06, "loss": 0.9666, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 617, "tokens_per_second_per_gpu": 17774.31, "total_tokens": 61063583 }, { "epoch": 0.038634658664666165, "grad_norm": 1.020642638206482, "learning_rate": 7.717323327079426e-06, "loss": 0.9903, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 618, "tokens_per_second_per_gpu": 15809.31, "total_tokens": 61160429 }, { "epoch": 0.038697174293573396, "grad_norm": 1.0726666450500488, "learning_rate": 7.729831144465293e-06, "loss": 1.0056, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 619, "tokens_per_second_per_gpu": 18319.19, "total_tokens": 61256432 }, { "epoch": 0.03875968992248062, "grad_norm": 1.0883163213729858, "learning_rate": 7.742338961851158e-06, "loss": 1.0331, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 620, "tokens_per_second_per_gpu": 16309.75, "total_tokens": 61350530 }, { "epoch": 0.038822205551387844, "grad_norm": 1.0423600673675537, "learning_rate": 7.754846779237025e-06, "loss": 1.0387, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 621, "tokens_per_second_per_gpu": 16379.67, "total_tokens": 61446552 }, { "epoch": 0.038884721180295075, "grad_norm": 1.0379198789596558, "learning_rate": 7.76735459662289e-06, "loss": 1.0792, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 622, "tokens_per_second_per_gpu": 18842.54, "total_tokens": 61551908 }, { "epoch": 0.0389472368092023, "grad_norm": 1.001206636428833, "learning_rate": 7.779862414008757e-06, "loss": 0.9925, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 623, "tokens_per_second_per_gpu": 17482.59, "total_tokens": 61651785 }, { "epoch": 0.03900975243810953, "grad_norm": 1.0570077896118164, "learning_rate": 7.792370231394622e-06, "loss": 1.0414, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 624, "tokens_per_second_per_gpu": 18471.22, "total_tokens": 61754651 }, { "epoch": 0.039072268067016754, "grad_norm": 1.0638846158981323, "learning_rate": 7.804878048780489e-06, "loss": 1.0331, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 625, "tokens_per_second_per_gpu": 17539.79, "total_tokens": 61854931 }, { "epoch": 0.03913478369592398, "grad_norm": 1.0643562078475952, "learning_rate": 7.817385866166356e-06, "loss": 1.037, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 626, "tokens_per_second_per_gpu": 16972.52, "total_tokens": 61951313 }, { "epoch": 0.03919729932483121, "grad_norm": 1.0176595449447632, "learning_rate": 7.829893683552221e-06, "loss": 0.95, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 627, "tokens_per_second_per_gpu": 17205.82, "total_tokens": 62046687 }, { "epoch": 0.03925981495373843, "grad_norm": 1.0082674026489258, "learning_rate": 7.842401500938088e-06, "loss": 1.0216, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 628, "tokens_per_second_per_gpu": 17401.73, "total_tokens": 62147354 }, { "epoch": 0.03932233058264566, "grad_norm": 1.0619206428527832, "learning_rate": 7.854909318323953e-06, "loss": 1.0199, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 629, "tokens_per_second_per_gpu": 17215.22, "total_tokens": 62246795 }, { "epoch": 0.03938484621155289, "grad_norm": 1.0950030088424683, "learning_rate": 7.86741713570982e-06, "loss": 1.026, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 630, "tokens_per_second_per_gpu": 18190.14, "total_tokens": 62343866 }, { "epoch": 0.03944736184046012, "grad_norm": 1.0404874086380005, "learning_rate": 7.879924953095685e-06, "loss": 0.9826, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 631, "tokens_per_second_per_gpu": 15876.86, "total_tokens": 62439209 }, { "epoch": 0.03950987746936734, "grad_norm": 1.0336198806762695, "learning_rate": 7.892432770481552e-06, "loss": 1.002, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 632, "tokens_per_second_per_gpu": 16001.03, "total_tokens": 62533449 }, { "epoch": 0.039572393098274566, "grad_norm": 1.098371982574463, "learning_rate": 7.904940587867417e-06, "loss": 1.0333, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 633, "tokens_per_second_per_gpu": 16386.73, "total_tokens": 62629538 }, { "epoch": 0.0396349087271818, "grad_norm": 1.0392698049545288, "learning_rate": 7.917448405253284e-06, "loss": 1.0331, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 634, "tokens_per_second_per_gpu": 17033.65, "total_tokens": 62726853 }, { "epoch": 0.03969742435608902, "grad_norm": 1.0308947563171387, "learning_rate": 7.92995622263915e-06, "loss": 1.0008, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 635, "tokens_per_second_per_gpu": 17905.12, "total_tokens": 62827878 }, { "epoch": 0.03975993998499625, "grad_norm": 1.1663248538970947, "learning_rate": 7.942464040025016e-06, "loss": 1.0526, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 636, "tokens_per_second_per_gpu": 18835.12, "total_tokens": 62934636 }, { "epoch": 0.039822455613903476, "grad_norm": 1.0661823749542236, "learning_rate": 7.954971857410883e-06, "loss": 0.9965, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 637, "tokens_per_second_per_gpu": 17800.43, "total_tokens": 63033132 }, { "epoch": 0.0398849712428107, "grad_norm": 1.0269558429718018, "learning_rate": 7.967479674796748e-06, "loss": 1.0336, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 638, "tokens_per_second_per_gpu": 18296.3, "total_tokens": 63132927 }, { "epoch": 0.03994748687171793, "grad_norm": 1.0754063129425049, "learning_rate": 7.979987492182615e-06, "loss": 1.033, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 639, "tokens_per_second_per_gpu": 16053.7, "total_tokens": 63229355 }, { "epoch": 0.040010002500625155, "grad_norm": 1.0204157829284668, "learning_rate": 7.99249530956848e-06, "loss": 0.9814, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 640, "tokens_per_second_per_gpu": 16884.74, "total_tokens": 63330008 }, { "epoch": 0.040072518129532386, "grad_norm": 1.1178241968154907, "learning_rate": 8.005003126954347e-06, "loss": 1.0359, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 641, "tokens_per_second_per_gpu": 17006.9, "total_tokens": 63426211 }, { "epoch": 0.04013503375843961, "grad_norm": 1.0520209074020386, "learning_rate": 8.017510944340214e-06, "loss": 1.0082, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 642, "tokens_per_second_per_gpu": 17303.32, "total_tokens": 63526319 }, { "epoch": 0.040197549387346834, "grad_norm": 1.0144275426864624, "learning_rate": 8.030018761726079e-06, "loss": 0.9858, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 643, "tokens_per_second_per_gpu": 17837.46, "total_tokens": 63627415 }, { "epoch": 0.040260065016254065, "grad_norm": 1.105050802230835, "learning_rate": 8.042526579111946e-06, "loss": 1.0307, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 644, "tokens_per_second_per_gpu": 17990.33, "total_tokens": 63729879 }, { "epoch": 0.04032258064516129, "grad_norm": 1.0494874715805054, "learning_rate": 8.055034396497811e-06, "loss": 1.0338, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 645, "tokens_per_second_per_gpu": 17491.3, "total_tokens": 63828067 }, { "epoch": 0.04038509627406852, "grad_norm": 1.0339964628219604, "learning_rate": 8.067542213883678e-06, "loss": 1.0099, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 646, "tokens_per_second_per_gpu": 16410.83, "total_tokens": 63925888 }, { "epoch": 0.040447611902975744, "grad_norm": 1.0570247173309326, "learning_rate": 8.080050031269543e-06, "loss": 0.9947, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 647, "tokens_per_second_per_gpu": 17343.97, "total_tokens": 64028969 }, { "epoch": 0.04051012753188297, "grad_norm": 1.0483418703079224, "learning_rate": 8.09255784865541e-06, "loss": 0.9919, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 648, "tokens_per_second_per_gpu": 16447.86, "total_tokens": 64126269 }, { "epoch": 0.0405726431607902, "grad_norm": 1.064203143119812, "learning_rate": 8.105065666041277e-06, "loss": 1.0138, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 649, "tokens_per_second_per_gpu": 16171.95, "total_tokens": 64222525 }, { "epoch": 0.04063515878969742, "grad_norm": 1.0141490697860718, "learning_rate": 8.117573483427142e-06, "loss": 0.9767, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 650, "tokens_per_second_per_gpu": 17450.99, "total_tokens": 64324221 }, { "epoch": 0.040697674418604654, "grad_norm": 1.0169841051101685, "learning_rate": 8.130081300813009e-06, "loss": 0.9713, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 651, "tokens_per_second_per_gpu": 18969.84, "total_tokens": 64425472 }, { "epoch": 0.04076019004751188, "grad_norm": 1.060351848602295, "learning_rate": 8.142589118198874e-06, "loss": 0.9997, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 652, "tokens_per_second_per_gpu": 17188.09, "total_tokens": 64523197 }, { "epoch": 0.0408227056764191, "grad_norm": 1.0584394931793213, "learning_rate": 8.155096935584741e-06, "loss": 0.9675, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 653, "tokens_per_second_per_gpu": 17075.59, "total_tokens": 64620663 }, { "epoch": 0.04088522130532633, "grad_norm": 1.1030757427215576, "learning_rate": 8.167604752970606e-06, "loss": 1.0943, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 654, "tokens_per_second_per_gpu": 17869.91, "total_tokens": 64725505 }, { "epoch": 0.04094773693423356, "grad_norm": 1.0245373249053955, "learning_rate": 8.180112570356473e-06, "loss": 0.9389, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 655, "tokens_per_second_per_gpu": 17983.48, "total_tokens": 64821084 }, { "epoch": 0.04101025256314079, "grad_norm": 1.0356584787368774, "learning_rate": 8.19262038774234e-06, "loss": 1.0057, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 656, "tokens_per_second_per_gpu": 16984.16, "total_tokens": 64917873 }, { "epoch": 0.04107276819204801, "grad_norm": 1.0784592628479004, "learning_rate": 8.205128205128205e-06, "loss": 1.0619, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 657, "tokens_per_second_per_gpu": 17201.12, "total_tokens": 65019309 }, { "epoch": 0.041135283820955236, "grad_norm": 1.0471861362457275, "learning_rate": 8.217636022514072e-06, "loss": 1.0333, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 658, "tokens_per_second_per_gpu": 17515.85, "total_tokens": 65117179 }, { "epoch": 0.04119779944986247, "grad_norm": 1.0470298528671265, "learning_rate": 8.230143839899937e-06, "loss": 0.995, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 659, "tokens_per_second_per_gpu": 17513.39, "total_tokens": 65214745 }, { "epoch": 0.04126031507876969, "grad_norm": 1.0318520069122314, "learning_rate": 8.242651657285804e-06, "loss": 0.9766, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 660, "tokens_per_second_per_gpu": 16941.42, "total_tokens": 65315580 }, { "epoch": 0.04132283070767692, "grad_norm": 0.9885413646697998, "learning_rate": 8.25515947467167e-06, "loss": 1.0156, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 661, "tokens_per_second_per_gpu": 17877.72, "total_tokens": 65416675 }, { "epoch": 0.041385346336584146, "grad_norm": 1.0653547048568726, "learning_rate": 8.267667292057536e-06, "loss": 1.0406, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 662, "tokens_per_second_per_gpu": 18233.94, "total_tokens": 65519712 }, { "epoch": 0.04144786196549137, "grad_norm": 1.1048734188079834, "learning_rate": 8.280175109443402e-06, "loss": 1.0367, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 663, "tokens_per_second_per_gpu": 18634.71, "total_tokens": 65623899 }, { "epoch": 0.0415103775943986, "grad_norm": 1.055181860923767, "learning_rate": 8.292682926829268e-06, "loss": 0.9798, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 664, "tokens_per_second_per_gpu": 16485.46, "total_tokens": 65717811 }, { "epoch": 0.041572893223305825, "grad_norm": 1.045939564704895, "learning_rate": 8.305190744215135e-06, "loss": 0.9808, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 665, "tokens_per_second_per_gpu": 16610.6, "total_tokens": 65816534 }, { "epoch": 0.041635408852213056, "grad_norm": 1.078974962234497, "learning_rate": 8.317698561601e-06, "loss": 1.0381, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 666, "tokens_per_second_per_gpu": 18017.56, "total_tokens": 65916755 }, { "epoch": 0.04169792448112028, "grad_norm": 1.005890965461731, "learning_rate": 8.330206378986867e-06, "loss": 0.989, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 667, "tokens_per_second_per_gpu": 16887.99, "total_tokens": 66015841 }, { "epoch": 0.041760440110027504, "grad_norm": 1.0609387159347534, "learning_rate": 8.342714196372733e-06, "loss": 0.9938, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 668, "tokens_per_second_per_gpu": 17760.66, "total_tokens": 66116442 }, { "epoch": 0.041822955738934735, "grad_norm": 1.085809588432312, "learning_rate": 8.3552220137586e-06, "loss": 1.0226, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 669, "tokens_per_second_per_gpu": 16529.77, "total_tokens": 66210453 }, { "epoch": 0.04188547136784196, "grad_norm": 1.0462361574172974, "learning_rate": 8.367729831144465e-06, "loss": 1.0509, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 670, "tokens_per_second_per_gpu": 17971.8, "total_tokens": 66309791 }, { "epoch": 0.04194798699674919, "grad_norm": 1.1196411848068237, "learning_rate": 8.380237648530332e-06, "loss": 0.9642, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 671, "tokens_per_second_per_gpu": 16382.45, "total_tokens": 66406163 }, { "epoch": 0.042010502625656414, "grad_norm": 1.0357646942138672, "learning_rate": 8.392745465916198e-06, "loss": 0.9854, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 672, "tokens_per_second_per_gpu": 18283.86, "total_tokens": 66504982 }, { "epoch": 0.04207301825456364, "grad_norm": 1.0503365993499756, "learning_rate": 8.405253283302065e-06, "loss": 1.0098, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 673, "tokens_per_second_per_gpu": 17246.66, "total_tokens": 66603243 }, { "epoch": 0.04213553388347087, "grad_norm": 0.9950414299964905, "learning_rate": 8.41776110068793e-06, "loss": 0.9947, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 674, "tokens_per_second_per_gpu": 18034.94, "total_tokens": 66707213 }, { "epoch": 0.04219804951237809, "grad_norm": 1.037438154220581, "learning_rate": 8.430268918073797e-06, "loss": 1.0149, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 675, "tokens_per_second_per_gpu": 17967.74, "total_tokens": 66805880 }, { "epoch": 0.042260565141285324, "grad_norm": 1.0076608657836914, "learning_rate": 8.442776735459664e-06, "loss": 1.0138, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 676, "tokens_per_second_per_gpu": 18342.14, "total_tokens": 66907905 }, { "epoch": 0.04232308077019255, "grad_norm": 1.0710400342941284, "learning_rate": 8.45528455284553e-06, "loss": 1.0181, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 677, "tokens_per_second_per_gpu": 17962.58, "total_tokens": 67010542 }, { "epoch": 0.04238559639909977, "grad_norm": 1.038191795349121, "learning_rate": 8.467792370231396e-06, "loss": 0.9873, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 678, "tokens_per_second_per_gpu": 17853.77, "total_tokens": 67109114 }, { "epoch": 0.042448112028007, "grad_norm": 1.0188418626785278, "learning_rate": 8.480300187617262e-06, "loss": 1.0239, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 679, "tokens_per_second_per_gpu": 17732.49, "total_tokens": 67213368 }, { "epoch": 0.04251062765691423, "grad_norm": 1.0128191709518433, "learning_rate": 8.492808005003128e-06, "loss": 0.9454, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 680, "tokens_per_second_per_gpu": 16897.37, "total_tokens": 67310524 }, { "epoch": 0.04257314328582146, "grad_norm": 1.0403424501419067, "learning_rate": 8.505315822388994e-06, "loss": 1.0536, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 681, "tokens_per_second_per_gpu": 18145.5, "total_tokens": 67412377 }, { "epoch": 0.04263565891472868, "grad_norm": 1.081223487854004, "learning_rate": 8.51782363977486e-06, "loss": 0.9782, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 682, "tokens_per_second_per_gpu": 16178.02, "total_tokens": 67503662 }, { "epoch": 0.042698174543635906, "grad_norm": 1.050121784210205, "learning_rate": 8.530331457160726e-06, "loss": 1.0395, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 683, "tokens_per_second_per_gpu": 17947.35, "total_tokens": 67607443 }, { "epoch": 0.04276069017254314, "grad_norm": 1.0576289892196655, "learning_rate": 8.542839274546593e-06, "loss": 0.9739, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 684, "tokens_per_second_per_gpu": 16880.47, "total_tokens": 67706112 }, { "epoch": 0.04282320580145036, "grad_norm": 1.0366255044937134, "learning_rate": 8.55534709193246e-06, "loss": 0.9587, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 685, "tokens_per_second_per_gpu": 16334.13, "total_tokens": 67797434 }, { "epoch": 0.04288572143035759, "grad_norm": 1.0193862915039062, "learning_rate": 8.567854909318325e-06, "loss": 0.973, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 686, "tokens_per_second_per_gpu": 17278.46, "total_tokens": 67896879 }, { "epoch": 0.042948237059264815, "grad_norm": 1.0495675802230835, "learning_rate": 8.580362726704192e-06, "loss": 1.0126, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 687, "tokens_per_second_per_gpu": 17532.65, "total_tokens": 67997117 }, { "epoch": 0.043010752688172046, "grad_norm": 1.0633172988891602, "learning_rate": 8.592870544090057e-06, "loss": 1.0021, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 688, "tokens_per_second_per_gpu": 16963.15, "total_tokens": 68098872 }, { "epoch": 0.04307326831707927, "grad_norm": 1.030452847480774, "learning_rate": 8.605378361475924e-06, "loss": 0.9764, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 689, "tokens_per_second_per_gpu": 17250.82, "total_tokens": 68196794 }, { "epoch": 0.043135783945986494, "grad_norm": 1.052037239074707, "learning_rate": 8.617886178861789e-06, "loss": 1.0027, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 690, "tokens_per_second_per_gpu": 17158.05, "total_tokens": 68294500 }, { "epoch": 0.043198299574893725, "grad_norm": 1.0130367279052734, "learning_rate": 8.630393996247656e-06, "loss": 1.0369, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 691, "tokens_per_second_per_gpu": 17661.59, "total_tokens": 68397747 }, { "epoch": 0.04326081520380095, "grad_norm": 0.9834839105606079, "learning_rate": 8.642901813633523e-06, "loss": 0.969, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 692, "tokens_per_second_per_gpu": 17223.42, "total_tokens": 68496923 }, { "epoch": 0.04332333083270818, "grad_norm": 1.007452130317688, "learning_rate": 8.655409631019388e-06, "loss": 0.9848, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 693, "tokens_per_second_per_gpu": 16630.17, "total_tokens": 68597498 }, { "epoch": 0.043385846461615404, "grad_norm": 1.0606247186660767, "learning_rate": 8.667917448405255e-06, "loss": 1.0068, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 694, "tokens_per_second_per_gpu": 16500.8, "total_tokens": 68693805 }, { "epoch": 0.04344836209052263, "grad_norm": 1.03005850315094, "learning_rate": 8.68042526579112e-06, "loss": 0.9755, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 695, "tokens_per_second_per_gpu": 17010.51, "total_tokens": 68793728 }, { "epoch": 0.04351087771942986, "grad_norm": 1.0977177619934082, "learning_rate": 8.692933083176987e-06, "loss": 1.006, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 696, "tokens_per_second_per_gpu": 17529.7, "total_tokens": 68894482 }, { "epoch": 0.04357339334833708, "grad_norm": 1.0160439014434814, "learning_rate": 8.705440900562852e-06, "loss": 0.9671, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 697, "tokens_per_second_per_gpu": 17135.53, "total_tokens": 68993055 }, { "epoch": 0.043635908977244314, "grad_norm": 1.0288403034210205, "learning_rate": 8.717948717948719e-06, "loss": 0.987, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 698, "tokens_per_second_per_gpu": 17822.21, "total_tokens": 69091849 }, { "epoch": 0.04369842460615154, "grad_norm": 1.041219711303711, "learning_rate": 8.730456535334586e-06, "loss": 0.9962, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 699, "tokens_per_second_per_gpu": 17077.98, "total_tokens": 69189570 }, { "epoch": 0.04376094023505876, "grad_norm": 1.0363926887512207, "learning_rate": 8.742964352720451e-06, "loss": 1.0132, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 700, "tokens_per_second_per_gpu": 17582.23, "total_tokens": 69290156 }, { "epoch": 0.04382345586396599, "grad_norm": 1.0821119546890259, "learning_rate": 8.755472170106318e-06, "loss": 1.0732, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 701, "tokens_per_second_per_gpu": 17792.91, "total_tokens": 69392670 }, { "epoch": 0.04388597149287322, "grad_norm": 1.0143803358078003, "learning_rate": 8.767979987492183e-06, "loss": 1.0372, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 702, "tokens_per_second_per_gpu": 17809.43, "total_tokens": 69495004 }, { "epoch": 0.04394848712178045, "grad_norm": 1.0586916208267212, "learning_rate": 8.78048780487805e-06, "loss": 1.0234, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 703, "tokens_per_second_per_gpu": 16490.75, "total_tokens": 69590967 }, { "epoch": 0.04401100275068767, "grad_norm": 1.0173606872558594, "learning_rate": 8.792995622263915e-06, "loss": 0.9959, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 704, "tokens_per_second_per_gpu": 17042.68, "total_tokens": 69688188 }, { "epoch": 0.044073518379594896, "grad_norm": 1.0029289722442627, "learning_rate": 8.805503439649782e-06, "loss": 0.9189, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 705, "tokens_per_second_per_gpu": 16931.14, "total_tokens": 69785738 }, { "epoch": 0.04413603400850213, "grad_norm": 1.0009256601333618, "learning_rate": 8.818011257035647e-06, "loss": 1.0043, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 706, "tokens_per_second_per_gpu": 17705.29, "total_tokens": 69887505 }, { "epoch": 0.04419854963740935, "grad_norm": 1.092395544052124, "learning_rate": 8.830519074421514e-06, "loss": 0.9737, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 707, "tokens_per_second_per_gpu": 17451.62, "total_tokens": 69985280 }, { "epoch": 0.04426106526631658, "grad_norm": 1.0376931428909302, "learning_rate": 8.843026891807381e-06, "loss": 1.0059, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 708, "tokens_per_second_per_gpu": 17819.49, "total_tokens": 70084683 }, { "epoch": 0.044323580895223806, "grad_norm": 1.0448869466781616, "learning_rate": 8.855534709193246e-06, "loss": 0.9887, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 709, "tokens_per_second_per_gpu": 16395.54, "total_tokens": 70183125 }, { "epoch": 0.04438609652413103, "grad_norm": 1.1959556341171265, "learning_rate": 8.868042526579113e-06, "loss": 1.0153, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 710, "tokens_per_second_per_gpu": 16692.86, "total_tokens": 70277087 }, { "epoch": 0.04444861215303826, "grad_norm": 1.0978741645812988, "learning_rate": 8.880550343964978e-06, "loss": 1.0335, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 711, "tokens_per_second_per_gpu": 16739.2, "total_tokens": 70375481 }, { "epoch": 0.044511127781945485, "grad_norm": 1.0227068662643433, "learning_rate": 8.893058161350845e-06, "loss": 1.0054, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 712, "tokens_per_second_per_gpu": 17630.71, "total_tokens": 70476874 }, { "epoch": 0.044573643410852716, "grad_norm": 0.9695127606391907, "learning_rate": 8.90556597873671e-06, "loss": 0.9894, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 713, "tokens_per_second_per_gpu": 17474.69, "total_tokens": 70579371 }, { "epoch": 0.04463615903975994, "grad_norm": 1.0991657972335815, "learning_rate": 8.918073796122577e-06, "loss": 1.0129, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 714, "tokens_per_second_per_gpu": 17895.18, "total_tokens": 70678350 }, { "epoch": 0.044698674668667164, "grad_norm": 1.0860685110092163, "learning_rate": 8.930581613508444e-06, "loss": 0.9287, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 715, "tokens_per_second_per_gpu": 16155.14, "total_tokens": 70770335 }, { "epoch": 0.044761190297574395, "grad_norm": 1.0201932191848755, "learning_rate": 8.94308943089431e-06, "loss": 0.9836, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 716, "tokens_per_second_per_gpu": 18213.34, "total_tokens": 70872002 }, { "epoch": 0.04482370592648162, "grad_norm": 1.075561761856079, "learning_rate": 8.955597248280176e-06, "loss": 1.0356, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 717, "tokens_per_second_per_gpu": 17942.82, "total_tokens": 70973776 }, { "epoch": 0.04488622155538885, "grad_norm": 1.0667518377304077, "learning_rate": 8.968105065666041e-06, "loss": 1.0187, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 718, "tokens_per_second_per_gpu": 17420.86, "total_tokens": 71071626 }, { "epoch": 0.044948737184296074, "grad_norm": 1.0669569969177246, "learning_rate": 8.980612883051908e-06, "loss": 1.0022, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 719, "tokens_per_second_per_gpu": 18548.13, "total_tokens": 71172413 }, { "epoch": 0.0450112528132033, "grad_norm": 1.1653071641921997, "learning_rate": 8.993120700437773e-06, "loss": 1.0198, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 720, "tokens_per_second_per_gpu": 18055.77, "total_tokens": 71270640 }, { "epoch": 0.04507376844211053, "grad_norm": 1.0196864604949951, "learning_rate": 9.00562851782364e-06, "loss": 0.9419, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 721, "tokens_per_second_per_gpu": 17537.05, "total_tokens": 71368553 }, { "epoch": 0.04513628407101775, "grad_norm": 1.082931637763977, "learning_rate": 9.018136335209507e-06, "loss": 0.983, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 722, "tokens_per_second_per_gpu": 16722.63, "total_tokens": 71464960 }, { "epoch": 0.045198799699924984, "grad_norm": 1.0342251062393188, "learning_rate": 9.030644152595372e-06, "loss": 0.9976, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 723, "tokens_per_second_per_gpu": 17207.73, "total_tokens": 71563494 }, { "epoch": 0.04526131532883221, "grad_norm": 1.0946743488311768, "learning_rate": 9.04315196998124e-06, "loss": 0.9965, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 724, "tokens_per_second_per_gpu": 17122.19, "total_tokens": 71660001 }, { "epoch": 0.04532383095773943, "grad_norm": 0.9987621903419495, "learning_rate": 9.055659787367104e-06, "loss": 0.9756, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 725, "tokens_per_second_per_gpu": 17932.88, "total_tokens": 71764581 }, { "epoch": 0.04538634658664666, "grad_norm": 0.9846581220626831, "learning_rate": 9.068167604752971e-06, "loss": 0.9786, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 726, "tokens_per_second_per_gpu": 18582.43, "total_tokens": 71868585 }, { "epoch": 0.04544886221555389, "grad_norm": 1.4627597332000732, "learning_rate": 9.080675422138836e-06, "loss": 0.9689, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 727, "tokens_per_second_per_gpu": 17946.26, "total_tokens": 71967837 }, { "epoch": 0.04551137784446112, "grad_norm": 1.073559045791626, "learning_rate": 9.093183239524703e-06, "loss": 0.919, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 728, "tokens_per_second_per_gpu": 17284.65, "total_tokens": 72062545 }, { "epoch": 0.04557389347336834, "grad_norm": 0.9912831783294678, "learning_rate": 9.10569105691057e-06, "loss": 0.9426, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 729, "tokens_per_second_per_gpu": 16860.32, "total_tokens": 72161938 }, { "epoch": 0.045636409102275566, "grad_norm": 1.043105959892273, "learning_rate": 9.118198874296435e-06, "loss": 0.9609, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 730, "tokens_per_second_per_gpu": 17102.09, "total_tokens": 72258746 }, { "epoch": 0.0456989247311828, "grad_norm": 1.0600335597991943, "learning_rate": 9.130706691682302e-06, "loss": 1.0118, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 731, "tokens_per_second_per_gpu": 16618.27, "total_tokens": 72352819 }, { "epoch": 0.04576144036009002, "grad_norm": 1.084679126739502, "learning_rate": 9.143214509068168e-06, "loss": 0.9879, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 732, "tokens_per_second_per_gpu": 16806.34, "total_tokens": 72454544 }, { "epoch": 0.04582395598899725, "grad_norm": 1.0286197662353516, "learning_rate": 9.155722326454034e-06, "loss": 0.9804, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 733, "tokens_per_second_per_gpu": 18562.3, "total_tokens": 72555853 }, { "epoch": 0.045886471617904476, "grad_norm": 1.0279263257980347, "learning_rate": 9.1682301438399e-06, "loss": 0.9707, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 734, "tokens_per_second_per_gpu": 17905.5, "total_tokens": 72652891 }, { "epoch": 0.0459489872468117, "grad_norm": 1.0427989959716797, "learning_rate": 9.180737961225766e-06, "loss": 0.996, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 735, "tokens_per_second_per_gpu": 17935.09, "total_tokens": 72755046 }, { "epoch": 0.04601150287571893, "grad_norm": 1.102614164352417, "learning_rate": 9.193245778611632e-06, "loss": 0.9597, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 736, "tokens_per_second_per_gpu": 16431.71, "total_tokens": 72851830 }, { "epoch": 0.046074018504626155, "grad_norm": 1.0628125667572021, "learning_rate": 9.2057535959975e-06, "loss": 1.0056, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 737, "tokens_per_second_per_gpu": 17224.75, "total_tokens": 72950650 }, { "epoch": 0.046136534133533386, "grad_norm": 1.0420893430709839, "learning_rate": 9.218261413383365e-06, "loss": 0.9519, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 738, "tokens_per_second_per_gpu": 17586.45, "total_tokens": 73046193 }, { "epoch": 0.04619904976244061, "grad_norm": 1.005391240119934, "learning_rate": 9.230769230769232e-06, "loss": 0.9998, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 739, "tokens_per_second_per_gpu": 18674.44, "total_tokens": 73148565 }, { "epoch": 0.046261565391347834, "grad_norm": 1.0289547443389893, "learning_rate": 9.243277048155097e-06, "loss": 0.9371, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 740, "tokens_per_second_per_gpu": 17260.58, "total_tokens": 73247852 }, { "epoch": 0.046324081020255065, "grad_norm": 1.1664276123046875, "learning_rate": 9.255784865540964e-06, "loss": 1.0098, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 741, "tokens_per_second_per_gpu": 17574.36, "total_tokens": 73346533 }, { "epoch": 0.04638659664916229, "grad_norm": 1.0804905891418457, "learning_rate": 9.268292682926831e-06, "loss": 0.9907, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 742, "tokens_per_second_per_gpu": 16806.47, "total_tokens": 73444288 }, { "epoch": 0.04644911227806952, "grad_norm": 1.0033332109451294, "learning_rate": 9.280800500312696e-06, "loss": 0.9762, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 743, "tokens_per_second_per_gpu": 17665.43, "total_tokens": 73544037 }, { "epoch": 0.046511627906976744, "grad_norm": 1.2003915309906006, "learning_rate": 9.293308317698563e-06, "loss": 0.9951, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 744, "tokens_per_second_per_gpu": 16131.48, "total_tokens": 73642795 }, { "epoch": 0.04657414353588397, "grad_norm": 1.0545881986618042, "learning_rate": 9.305816135084429e-06, "loss": 0.978, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 745, "tokens_per_second_per_gpu": 16754.02, "total_tokens": 73741884 }, { "epoch": 0.0466366591647912, "grad_norm": 1.0865731239318848, "learning_rate": 9.318323952470295e-06, "loss": 0.9919, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 746, "tokens_per_second_per_gpu": 16144.28, "total_tokens": 73834439 }, { "epoch": 0.04669917479369842, "grad_norm": 1.054713249206543, "learning_rate": 9.33083176985616e-06, "loss": 0.9844, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 747, "tokens_per_second_per_gpu": 17060.37, "total_tokens": 73932552 }, { "epoch": 0.04676169042260565, "grad_norm": 1.0302708148956299, "learning_rate": 9.343339587242027e-06, "loss": 0.9788, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 748, "tokens_per_second_per_gpu": 17659.72, "total_tokens": 74034606 }, { "epoch": 0.04682420605151288, "grad_norm": 1.0759634971618652, "learning_rate": 9.355847404627894e-06, "loss": 0.9862, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 749, "tokens_per_second_per_gpu": 15195.72, "total_tokens": 74128201 }, { "epoch": 0.04688672168042011, "grad_norm": 1.028421401977539, "learning_rate": 9.36835522201376e-06, "loss": 0.9879, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 750, "tokens_per_second_per_gpu": 18102.75, "total_tokens": 74223993 }, { "epoch": 0.04694923730932733, "grad_norm": 0.9716877937316895, "learning_rate": 9.380863039399626e-06, "loss": 0.8702, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 751, "tokens_per_second_per_gpu": 18021.83, "total_tokens": 74321554 }, { "epoch": 0.047011752938234556, "grad_norm": 1.0178648233413696, "learning_rate": 9.393370856785492e-06, "loss": 0.9624, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 752, "tokens_per_second_per_gpu": 17095.83, "total_tokens": 74419552 }, { "epoch": 0.04707426856714179, "grad_norm": 1.0558632612228394, "learning_rate": 9.405878674171359e-06, "loss": 1.0019, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 753, "tokens_per_second_per_gpu": 17459.18, "total_tokens": 74518912 }, { "epoch": 0.04713678419604901, "grad_norm": 1.0613312721252441, "learning_rate": 9.418386491557224e-06, "loss": 1.0114, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 754, "tokens_per_second_per_gpu": 18414.41, "total_tokens": 74618572 }, { "epoch": 0.04719929982495624, "grad_norm": 1.0385421514511108, "learning_rate": 9.43089430894309e-06, "loss": 0.9643, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 755, "tokens_per_second_per_gpu": 17281.8, "total_tokens": 74716114 }, { "epoch": 0.047261815453863466, "grad_norm": 1.022834300994873, "learning_rate": 9.443402126328956e-06, "loss": 0.9741, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 756, "tokens_per_second_per_gpu": 17420.85, "total_tokens": 74815128 }, { "epoch": 0.04732433108277069, "grad_norm": 1.0562372207641602, "learning_rate": 9.455909943714823e-06, "loss": 0.9836, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 757, "tokens_per_second_per_gpu": 16838.73, "total_tokens": 74915495 }, { "epoch": 0.04738684671167792, "grad_norm": 1.0459721088409424, "learning_rate": 9.46841776110069e-06, "loss": 0.9977, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 758, "tokens_per_second_per_gpu": 17224.37, "total_tokens": 75012822 }, { "epoch": 0.047449362340585145, "grad_norm": 1.145971655845642, "learning_rate": 9.480925578486555e-06, "loss": 0.9965, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 759, "tokens_per_second_per_gpu": 16947.46, "total_tokens": 75107708 }, { "epoch": 0.047511877969492376, "grad_norm": 1.0258965492248535, "learning_rate": 9.493433395872422e-06, "loss": 0.9275, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 760, "tokens_per_second_per_gpu": 16920.9, "total_tokens": 75205966 }, { "epoch": 0.0475743935983996, "grad_norm": 1.0582674741744995, "learning_rate": 9.505941213258287e-06, "loss": 0.9575, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 761, "tokens_per_second_per_gpu": 16378.45, "total_tokens": 75299280 }, { "epoch": 0.047636909227306824, "grad_norm": 0.9776800274848938, "learning_rate": 9.518449030644154e-06, "loss": 0.9226, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 762, "tokens_per_second_per_gpu": 18456.71, "total_tokens": 75400951 }, { "epoch": 0.047699424856214055, "grad_norm": 1.0246539115905762, "learning_rate": 9.530956848030019e-06, "loss": 0.9403, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 763, "tokens_per_second_per_gpu": 17128.48, "total_tokens": 75500384 }, { "epoch": 0.04776194048512128, "grad_norm": 1.0595742464065552, "learning_rate": 9.543464665415886e-06, "loss": 0.8856, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 764, "tokens_per_second_per_gpu": 17386.88, "total_tokens": 75593135 }, { "epoch": 0.04782445611402851, "grad_norm": 1.0924584865570068, "learning_rate": 9.555972482801753e-06, "loss": 0.997, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 765, "tokens_per_second_per_gpu": 15763.01, "total_tokens": 75687616 }, { "epoch": 0.047886971742935734, "grad_norm": 1.0840437412261963, "learning_rate": 9.568480300187618e-06, "loss": 0.9682, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 766, "tokens_per_second_per_gpu": 17260.68, "total_tokens": 75786405 }, { "epoch": 0.04794948737184296, "grad_norm": 1.048980712890625, "learning_rate": 9.580988117573485e-06, "loss": 0.9385, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 767, "tokens_per_second_per_gpu": 16764.81, "total_tokens": 75881145 }, { "epoch": 0.04801200300075019, "grad_norm": 1.0469979047775269, "learning_rate": 9.59349593495935e-06, "loss": 0.94, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 768, "tokens_per_second_per_gpu": 17549.4, "total_tokens": 75981191 }, { "epoch": 0.04807451862965741, "grad_norm": 1.0038214921951294, "learning_rate": 9.606003752345217e-06, "loss": 0.9426, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 769, "tokens_per_second_per_gpu": 18482.32, "total_tokens": 76083063 }, { "epoch": 0.048137034258564644, "grad_norm": 1.0696794986724854, "learning_rate": 9.618511569731082e-06, "loss": 0.984, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 770, "tokens_per_second_per_gpu": 18306.33, "total_tokens": 76183317 }, { "epoch": 0.04819954988747187, "grad_norm": 1.062656283378601, "learning_rate": 9.631019387116949e-06, "loss": 0.9561, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 771, "tokens_per_second_per_gpu": 17175.16, "total_tokens": 76280930 }, { "epoch": 0.04826206551637909, "grad_norm": 1.0170997381210327, "learning_rate": 9.643527204502816e-06, "loss": 0.9712, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 772, "tokens_per_second_per_gpu": 17412.0, "total_tokens": 76383159 }, { "epoch": 0.04832458114528632, "grad_norm": 1.0313242673873901, "learning_rate": 9.656035021888681e-06, "loss": 0.9835, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 773, "tokens_per_second_per_gpu": 17787.53, "total_tokens": 76482470 }, { "epoch": 0.04838709677419355, "grad_norm": 1.0616264343261719, "learning_rate": 9.668542839274548e-06, "loss": 0.9602, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 774, "tokens_per_second_per_gpu": 16470.14, "total_tokens": 76578535 }, { "epoch": 0.04844961240310078, "grad_norm": 0.9732285141944885, "learning_rate": 9.681050656660413e-06, "loss": 0.947, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 775, "tokens_per_second_per_gpu": 18560.34, "total_tokens": 76679466 }, { "epoch": 0.048512128032008, "grad_norm": 1.0937846899032593, "learning_rate": 9.69355847404628e-06, "loss": 0.9808, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 776, "tokens_per_second_per_gpu": 16578.78, "total_tokens": 76774649 }, { "epoch": 0.048574643660915226, "grad_norm": 1.0108044147491455, "learning_rate": 9.706066291432145e-06, "loss": 1.0153, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 777, "tokens_per_second_per_gpu": 18587.98, "total_tokens": 76878094 }, { "epoch": 0.04863715928982246, "grad_norm": 1.0031976699829102, "learning_rate": 9.718574108818012e-06, "loss": 0.9884, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 778, "tokens_per_second_per_gpu": 16732.9, "total_tokens": 76978842 }, { "epoch": 0.04869967491872968, "grad_norm": 1.0587764978408813, "learning_rate": 9.731081926203877e-06, "loss": 0.9808, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 779, "tokens_per_second_per_gpu": 16755.37, "total_tokens": 77076267 }, { "epoch": 0.04876219054763691, "grad_norm": 0.9879472851753235, "learning_rate": 9.743589743589744e-06, "loss": 0.9758, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 780, "tokens_per_second_per_gpu": 18077.59, "total_tokens": 77177287 }, { "epoch": 0.048824706176544136, "grad_norm": 1.0008749961853027, "learning_rate": 9.756097560975611e-06, "loss": 1.006, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 781, "tokens_per_second_per_gpu": 18176.15, "total_tokens": 77280333 }, { "epoch": 0.04888722180545136, "grad_norm": 1.0539566278457642, "learning_rate": 9.768605378361476e-06, "loss": 0.9997, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 782, "tokens_per_second_per_gpu": 16738.35, "total_tokens": 77378180 }, { "epoch": 0.04894973743435859, "grad_norm": 0.9802868962287903, "learning_rate": 9.781113195747343e-06, "loss": 0.9689, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 783, "tokens_per_second_per_gpu": 18468.44, "total_tokens": 77478150 }, { "epoch": 0.049012253063265815, "grad_norm": 1.066696047782898, "learning_rate": 9.793621013133208e-06, "loss": 0.9916, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 784, "tokens_per_second_per_gpu": 17722.97, "total_tokens": 77574423 }, { "epoch": 0.049074768692173046, "grad_norm": 1.0246009826660156, "learning_rate": 9.806128830519075e-06, "loss": 0.9676, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 785, "tokens_per_second_per_gpu": 17553.12, "total_tokens": 77670443 }, { "epoch": 0.04913728432108027, "grad_norm": 1.003182053565979, "learning_rate": 9.81863664790494e-06, "loss": 0.9833, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 786, "tokens_per_second_per_gpu": 18299.35, "total_tokens": 77771766 }, { "epoch": 0.049199799949987494, "grad_norm": 1.0348275899887085, "learning_rate": 9.831144465290807e-06, "loss": 0.9451, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 787, "tokens_per_second_per_gpu": 17115.89, "total_tokens": 77870162 }, { "epoch": 0.049262315578894725, "grad_norm": 1.04762864112854, "learning_rate": 9.843652282676674e-06, "loss": 0.9768, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 788, "tokens_per_second_per_gpu": 16406.78, "total_tokens": 77964704 }, { "epoch": 0.04932483120780195, "grad_norm": 1.0355188846588135, "learning_rate": 9.85616010006254e-06, "loss": 0.9838, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 789, "tokens_per_second_per_gpu": 18201.7, "total_tokens": 78062691 }, { "epoch": 0.04938734683670918, "grad_norm": 1.1201255321502686, "learning_rate": 9.868667917448406e-06, "loss": 1.0098, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 790, "tokens_per_second_per_gpu": 16654.46, "total_tokens": 78162429 }, { "epoch": 0.049449862465616404, "grad_norm": 1.0202667713165283, "learning_rate": 9.881175734834271e-06, "loss": 0.9673, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 791, "tokens_per_second_per_gpu": 18039.83, "total_tokens": 78259185 }, { "epoch": 0.04951237809452363, "grad_norm": 1.0788789987564087, "learning_rate": 9.893683552220138e-06, "loss": 0.9754, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 792, "tokens_per_second_per_gpu": 16558.72, "total_tokens": 78350625 }, { "epoch": 0.04957489372343086, "grad_norm": 1.0393844842910767, "learning_rate": 9.906191369606003e-06, "loss": 1.0249, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 793, "tokens_per_second_per_gpu": 18829.42, "total_tokens": 78451634 }, { "epoch": 0.04963740935233808, "grad_norm": 1.0329643487930298, "learning_rate": 9.91869918699187e-06, "loss": 0.9564, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 794, "tokens_per_second_per_gpu": 16795.27, "total_tokens": 78548372 }, { "epoch": 0.049699924981245314, "grad_norm": 1.0399225950241089, "learning_rate": 9.931207004377737e-06, "loss": 0.9797, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 795, "tokens_per_second_per_gpu": 16857.49, "total_tokens": 78645079 }, { "epoch": 0.04976244061015254, "grad_norm": 1.0653363466262817, "learning_rate": 9.943714821763602e-06, "loss": 1.0311, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 796, "tokens_per_second_per_gpu": 17877.8, "total_tokens": 78744978 }, { "epoch": 0.04982495623905976, "grad_norm": 1.0731875896453857, "learning_rate": 9.95622263914947e-06, "loss": 0.9633, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 797, "tokens_per_second_per_gpu": 18058.1, "total_tokens": 78842054 }, { "epoch": 0.04988747186796699, "grad_norm": 1.0546882152557373, "learning_rate": 9.968730456535334e-06, "loss": 0.884, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 798, "tokens_per_second_per_gpu": 15515.08, "total_tokens": 78932033 }, { "epoch": 0.04994998749687422, "grad_norm": 1.0341078042984009, "learning_rate": 9.981238273921201e-06, "loss": 0.9682, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 799, "tokens_per_second_per_gpu": 17349.21, "total_tokens": 79031184 }, { "epoch": 0.05001250312578145, "grad_norm": 1.0862802267074585, "learning_rate": 9.993746091307067e-06, "loss": 0.9984, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 800, "tokens_per_second_per_gpu": 17320.88, "total_tokens": 79131465 }, { "epoch": 0.05007501875468867, "grad_norm": 0.9725344777107239, "learning_rate": 1.0006253908692933e-05, "loss": 0.8936, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 801, "tokens_per_second_per_gpu": 17421.8, "total_tokens": 79225585 }, { "epoch": 0.050137534383595896, "grad_norm": 1.0372965335845947, "learning_rate": 1.00187617260788e-05, "loss": 0.9822, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 802, "tokens_per_second_per_gpu": 18529.66, "total_tokens": 79330150 }, { "epoch": 0.05020005001250313, "grad_norm": 1.1088688373565674, "learning_rate": 1.0031269543464665e-05, "loss": 0.9855, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 803, "tokens_per_second_per_gpu": 16821.9, "total_tokens": 79427940 }, { "epoch": 0.05026256564141035, "grad_norm": 1.0507253408432007, "learning_rate": 1.0043777360850532e-05, "loss": 0.9511, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 804, "tokens_per_second_per_gpu": 17541.31, "total_tokens": 79525771 }, { "epoch": 0.05032508127031758, "grad_norm": 1.0845991373062134, "learning_rate": 1.0056285178236398e-05, "loss": 0.9786, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 805, "tokens_per_second_per_gpu": 16872.2, "total_tokens": 79622117 }, { "epoch": 0.050387596899224806, "grad_norm": 1.0392522811889648, "learning_rate": 1.0068792995622264e-05, "loss": 0.9994, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 806, "tokens_per_second_per_gpu": 16914.95, "total_tokens": 79720796 }, { "epoch": 0.050450112528132036, "grad_norm": 1.0525448322296143, "learning_rate": 1.008130081300813e-05, "loss": 0.9392, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 807, "tokens_per_second_per_gpu": 16535.05, "total_tokens": 79816507 }, { "epoch": 0.05051262815703926, "grad_norm": 1.0910948514938354, "learning_rate": 1.0093808630393998e-05, "loss": 0.9606, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 808, "tokens_per_second_per_gpu": 17250.91, "total_tokens": 79918031 }, { "epoch": 0.050575143785946484, "grad_norm": 1.0214064121246338, "learning_rate": 1.0106316447779862e-05, "loss": 1.0348, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 809, "tokens_per_second_per_gpu": 18789.21, "total_tokens": 80021660 }, { "epoch": 0.050637659414853715, "grad_norm": 1.0162416696548462, "learning_rate": 1.011882426516573e-05, "loss": 0.9468, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 810, "tokens_per_second_per_gpu": 18594.84, "total_tokens": 80124706 }, { "epoch": 0.05070017504376094, "grad_norm": 1.0874924659729004, "learning_rate": 1.0131332082551595e-05, "loss": 0.9873, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 811, "tokens_per_second_per_gpu": 17068.05, "total_tokens": 80216450 }, { "epoch": 0.05076269067266817, "grad_norm": 1.1003644466400146, "learning_rate": 1.0143839899937462e-05, "loss": 0.9563, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 812, "tokens_per_second_per_gpu": 16728.11, "total_tokens": 80310262 }, { "epoch": 0.050825206301575394, "grad_norm": 1.075901746749878, "learning_rate": 1.0156347717323328e-05, "loss": 0.9764, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 813, "tokens_per_second_per_gpu": 17916.33, "total_tokens": 80410351 }, { "epoch": 0.05088772193048262, "grad_norm": 1.0603152513504028, "learning_rate": 1.0168855534709194e-05, "loss": 0.9628, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 814, "tokens_per_second_per_gpu": 17566.78, "total_tokens": 80510433 }, { "epoch": 0.05095023755938985, "grad_norm": 1.0319116115570068, "learning_rate": 1.018136335209506e-05, "loss": 0.9699, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 815, "tokens_per_second_per_gpu": 18179.93, "total_tokens": 80609953 }, { "epoch": 0.05101275318829707, "grad_norm": 1.0376226902008057, "learning_rate": 1.0193871169480927e-05, "loss": 0.98, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 816, "tokens_per_second_per_gpu": 17082.66, "total_tokens": 80710134 }, { "epoch": 0.051075268817204304, "grad_norm": 1.0449283123016357, "learning_rate": 1.0206378986866792e-05, "loss": 0.9433, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 817, "tokens_per_second_per_gpu": 17783.64, "total_tokens": 80805878 }, { "epoch": 0.05113778444611153, "grad_norm": 1.0327750444412231, "learning_rate": 1.0218886804252659e-05, "loss": 0.9029, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 818, "tokens_per_second_per_gpu": 16981.5, "total_tokens": 80900935 }, { "epoch": 0.05120030007501875, "grad_norm": 0.9718929529190063, "learning_rate": 1.0231394621638524e-05, "loss": 0.9477, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 819, "tokens_per_second_per_gpu": 18643.12, "total_tokens": 81004328 }, { "epoch": 0.05126281570392598, "grad_norm": 1.0463371276855469, "learning_rate": 1.024390243902439e-05, "loss": 0.9374, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 820, "tokens_per_second_per_gpu": 17941.01, "total_tokens": 81101679 }, { "epoch": 0.05132533133283321, "grad_norm": 1.0625747442245483, "learning_rate": 1.0256410256410256e-05, "loss": 1.0247, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 821, "tokens_per_second_per_gpu": 17255.35, "total_tokens": 81200079 }, { "epoch": 0.05138784696174044, "grad_norm": 1.0199379920959473, "learning_rate": 1.0268918073796124e-05, "loss": 0.9135, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 822, "tokens_per_second_per_gpu": 16755.58, "total_tokens": 81293660 }, { "epoch": 0.05145036259064766, "grad_norm": 1.010695457458496, "learning_rate": 1.0281425891181988e-05, "loss": 0.9359, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 823, "tokens_per_second_per_gpu": 16909.61, "total_tokens": 81389684 }, { "epoch": 0.051512878219554886, "grad_norm": 1.0123769044876099, "learning_rate": 1.0293933708567857e-05, "loss": 0.9508, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 824, "tokens_per_second_per_gpu": 17464.09, "total_tokens": 81489344 }, { "epoch": 0.05157539384846212, "grad_norm": 1.0670197010040283, "learning_rate": 1.0306441525953722e-05, "loss": 0.9417, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 825, "tokens_per_second_per_gpu": 15593.53, "total_tokens": 81582439 }, { "epoch": 0.05163790947736934, "grad_norm": 1.0057096481323242, "learning_rate": 1.0318949343339589e-05, "loss": 0.9514, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 826, "tokens_per_second_per_gpu": 17450.89, "total_tokens": 81681958 }, { "epoch": 0.05170042510627657, "grad_norm": 1.0065478086471558, "learning_rate": 1.0331457160725454e-05, "loss": 0.9261, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 827, "tokens_per_second_per_gpu": 16285.23, "total_tokens": 81776696 }, { "epoch": 0.051762940735183796, "grad_norm": 1.0041675567626953, "learning_rate": 1.034396497811132e-05, "loss": 0.9678, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 828, "tokens_per_second_per_gpu": 18395.75, "total_tokens": 81877023 }, { "epoch": 0.05182545636409102, "grad_norm": 1.0151362419128418, "learning_rate": 1.0356472795497186e-05, "loss": 0.9238, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 829, "tokens_per_second_per_gpu": 17483.3, "total_tokens": 81973458 }, { "epoch": 0.05188797199299825, "grad_norm": 1.0222299098968506, "learning_rate": 1.0368980612883053e-05, "loss": 0.9027, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 830, "tokens_per_second_per_gpu": 16888.37, "total_tokens": 82071189 }, { "epoch": 0.051950487621905475, "grad_norm": 0.980967104434967, "learning_rate": 1.0381488430268918e-05, "loss": 0.9266, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 831, "tokens_per_second_per_gpu": 16771.51, "total_tokens": 82173042 }, { "epoch": 0.052013003250812706, "grad_norm": 1.0280736684799194, "learning_rate": 1.0393996247654785e-05, "loss": 1.0127, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 832, "tokens_per_second_per_gpu": 16993.37, "total_tokens": 82270752 }, { "epoch": 0.05207551887971993, "grad_norm": 0.994823694229126, "learning_rate": 1.0406504065040652e-05, "loss": 0.9566, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 833, "tokens_per_second_per_gpu": 17503.41, "total_tokens": 82371455 }, { "epoch": 0.052138034508627154, "grad_norm": 0.9977983832359314, "learning_rate": 1.0419011882426517e-05, "loss": 0.9231, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 834, "tokens_per_second_per_gpu": 17675.24, "total_tokens": 82471829 }, { "epoch": 0.052200550137534385, "grad_norm": 1.1759487390518188, "learning_rate": 1.0431519699812385e-05, "loss": 0.9356, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 835, "tokens_per_second_per_gpu": 16806.88, "total_tokens": 82569210 }, { "epoch": 0.05226306576644161, "grad_norm": 0.9959380626678467, "learning_rate": 1.0444027517198249e-05, "loss": 0.9395, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 836, "tokens_per_second_per_gpu": 16483.31, "total_tokens": 82668743 }, { "epoch": 0.05232558139534884, "grad_norm": 1.0043517351150513, "learning_rate": 1.0456535334584118e-05, "loss": 0.9684, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 837, "tokens_per_second_per_gpu": 18005.92, "total_tokens": 82770398 }, { "epoch": 0.052388097024256064, "grad_norm": 0.9999526739120483, "learning_rate": 1.0469043151969983e-05, "loss": 0.9332, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 838, "tokens_per_second_per_gpu": 16273.74, "total_tokens": 82864682 }, { "epoch": 0.05245061265316329, "grad_norm": 1.0827741622924805, "learning_rate": 1.048155096935585e-05, "loss": 0.9533, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 839, "tokens_per_second_per_gpu": 17367.5, "total_tokens": 82963394 }, { "epoch": 0.05251312828207052, "grad_norm": 1.0353502035140991, "learning_rate": 1.0494058786741715e-05, "loss": 0.9412, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 840, "tokens_per_second_per_gpu": 18090.36, "total_tokens": 83062451 }, { "epoch": 0.05257564391097774, "grad_norm": 1.0406179428100586, "learning_rate": 1.0506566604127582e-05, "loss": 0.9419, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 841, "tokens_per_second_per_gpu": 17570.69, "total_tokens": 83163387 }, { "epoch": 0.052638159539884974, "grad_norm": 1.0680601596832275, "learning_rate": 1.0519074421513447e-05, "loss": 0.9363, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 842, "tokens_per_second_per_gpu": 18390.95, "total_tokens": 83265710 }, { "epoch": 0.0527006751687922, "grad_norm": 1.0018161535263062, "learning_rate": 1.0531582238899314e-05, "loss": 0.942, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 843, "tokens_per_second_per_gpu": 16937.46, "total_tokens": 83364432 }, { "epoch": 0.05276319079769942, "grad_norm": 1.0169367790222168, "learning_rate": 1.0544090056285179e-05, "loss": 0.9807, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 844, "tokens_per_second_per_gpu": 18343.6, "total_tokens": 83467536 }, { "epoch": 0.05282570642660665, "grad_norm": 1.020241379737854, "learning_rate": 1.0556597873671046e-05, "loss": 0.9863, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 845, "tokens_per_second_per_gpu": 18301.25, "total_tokens": 83571781 }, { "epoch": 0.05288822205551388, "grad_norm": 1.0087140798568726, "learning_rate": 1.0569105691056911e-05, "loss": 0.9562, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 846, "tokens_per_second_per_gpu": 18060.73, "total_tokens": 83672919 }, { "epoch": 0.05295073768442111, "grad_norm": 1.0391619205474854, "learning_rate": 1.0581613508442778e-05, "loss": 0.9284, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 847, "tokens_per_second_per_gpu": 17274.3, "total_tokens": 83772909 }, { "epoch": 0.05301325331332833, "grad_norm": 1.059389352798462, "learning_rate": 1.0594121325828643e-05, "loss": 1.0031, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 848, "tokens_per_second_per_gpu": 17638.18, "total_tokens": 83874127 }, { "epoch": 0.053075768942235556, "grad_norm": 1.07355797290802, "learning_rate": 1.060662914321451e-05, "loss": 0.9693, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 849, "tokens_per_second_per_gpu": 17170.7, "total_tokens": 83973440 }, { "epoch": 0.05313828457114279, "grad_norm": 1.0492995977401733, "learning_rate": 1.0619136960600375e-05, "loss": 0.9906, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 850, "tokens_per_second_per_gpu": 18374.06, "total_tokens": 84072675 }, { "epoch": 0.05320080020005001, "grad_norm": 1.0284463167190552, "learning_rate": 1.0631644777986244e-05, "loss": 0.9496, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 851, "tokens_per_second_per_gpu": 16772.55, "total_tokens": 84167018 }, { "epoch": 0.05326331582895724, "grad_norm": 1.0468816757202148, "learning_rate": 1.0644152595372107e-05, "loss": 0.9737, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 852, "tokens_per_second_per_gpu": 15665.81, "total_tokens": 84263949 }, { "epoch": 0.053325831457864466, "grad_norm": 1.1397314071655273, "learning_rate": 1.0656660412757976e-05, "loss": 0.9307, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 853, "tokens_per_second_per_gpu": 16425.16, "total_tokens": 84362068 }, { "epoch": 0.05338834708677169, "grad_norm": 1.019189476966858, "learning_rate": 1.0669168230143841e-05, "loss": 0.9727, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 854, "tokens_per_second_per_gpu": 16711.77, "total_tokens": 84461209 }, { "epoch": 0.05345086271567892, "grad_norm": 0.9736299514770508, "learning_rate": 1.0681676047529708e-05, "loss": 0.8969, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 855, "tokens_per_second_per_gpu": 17777.45, "total_tokens": 84560852 }, { "epoch": 0.053513378344586145, "grad_norm": 1.0223617553710938, "learning_rate": 1.0694183864915573e-05, "loss": 0.9967, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 856, "tokens_per_second_per_gpu": 18357.46, "total_tokens": 84663970 }, { "epoch": 0.053575893973493376, "grad_norm": 1.0261772871017456, "learning_rate": 1.070669168230144e-05, "loss": 0.9163, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 857, "tokens_per_second_per_gpu": 17023.79, "total_tokens": 84758636 }, { "epoch": 0.0536384096024006, "grad_norm": 0.9829002022743225, "learning_rate": 1.0719199499687305e-05, "loss": 0.9335, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 858, "tokens_per_second_per_gpu": 17763.16, "total_tokens": 84861378 }, { "epoch": 0.053700925231307824, "grad_norm": 1.0690892934799194, "learning_rate": 1.0731707317073172e-05, "loss": 0.9966, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 859, "tokens_per_second_per_gpu": 18033.58, "total_tokens": 84963028 }, { "epoch": 0.053763440860215055, "grad_norm": 1.0821057558059692, "learning_rate": 1.0744215134459037e-05, "loss": 0.9501, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 860, "tokens_per_second_per_gpu": 17421.0, "total_tokens": 85057403 }, { "epoch": 0.05382595648912228, "grad_norm": 1.013401746749878, "learning_rate": 1.0756722951844904e-05, "loss": 0.9415, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 861, "tokens_per_second_per_gpu": 17505.72, "total_tokens": 85158877 }, { "epoch": 0.05388847211802951, "grad_norm": 1.0187870264053345, "learning_rate": 1.076923076923077e-05, "loss": 0.9523, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 862, "tokens_per_second_per_gpu": 18538.47, "total_tokens": 85261468 }, { "epoch": 0.053950987746936734, "grad_norm": 1.0171819925308228, "learning_rate": 1.0781738586616636e-05, "loss": 0.8871, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 863, "tokens_per_second_per_gpu": 16811.06, "total_tokens": 85356281 }, { "epoch": 0.05401350337584396, "grad_norm": 1.0150607824325562, "learning_rate": 1.0794246404002501e-05, "loss": 0.9293, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 864, "tokens_per_second_per_gpu": 17262.61, "total_tokens": 85450889 }, { "epoch": 0.05407601900475119, "grad_norm": 1.0986183881759644, "learning_rate": 1.080675422138837e-05, "loss": 0.9829, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 865, "tokens_per_second_per_gpu": 16922.32, "total_tokens": 85550099 }, { "epoch": 0.05413853463365841, "grad_norm": 1.040414571762085, "learning_rate": 1.0819262038774233e-05, "loss": 0.9629, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 866, "tokens_per_second_per_gpu": 17490.01, "total_tokens": 85650061 }, { "epoch": 0.05420105026256564, "grad_norm": 1.0853382349014282, "learning_rate": 1.0831769856160102e-05, "loss": 0.9521, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 867, "tokens_per_second_per_gpu": 16978.64, "total_tokens": 85746923 }, { "epoch": 0.05426356589147287, "grad_norm": 1.0475523471832275, "learning_rate": 1.0844277673545967e-05, "loss": 0.9884, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 868, "tokens_per_second_per_gpu": 17928.89, "total_tokens": 85847091 }, { "epoch": 0.0543260815203801, "grad_norm": 1.3058865070343018, "learning_rate": 1.0856785490931834e-05, "loss": 0.9667, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 869, "tokens_per_second_per_gpu": 17170.64, "total_tokens": 85947773 }, { "epoch": 0.05438859714928732, "grad_norm": 1.0164051055908203, "learning_rate": 1.08692933083177e-05, "loss": 0.9311, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 870, "tokens_per_second_per_gpu": 17356.48, "total_tokens": 86044347 }, { "epoch": 0.054451112778194546, "grad_norm": 1.0040271282196045, "learning_rate": 1.0881801125703566e-05, "loss": 0.9769, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 871, "tokens_per_second_per_gpu": 18029.89, "total_tokens": 86143935 }, { "epoch": 0.05451362840710178, "grad_norm": 1.0856165885925293, "learning_rate": 1.0894308943089431e-05, "loss": 0.9727, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 872, "tokens_per_second_per_gpu": 17891.85, "total_tokens": 86247052 }, { "epoch": 0.054576144036009, "grad_norm": 1.122954249382019, "learning_rate": 1.0906816760475298e-05, "loss": 0.9883, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 873, "tokens_per_second_per_gpu": 17852.75, "total_tokens": 86349563 }, { "epoch": 0.05463865966491623, "grad_norm": 1.0545402765274048, "learning_rate": 1.0919324577861163e-05, "loss": 0.9557, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 874, "tokens_per_second_per_gpu": 17431.13, "total_tokens": 86449658 }, { "epoch": 0.054701175293823456, "grad_norm": 1.0893816947937012, "learning_rate": 1.093183239524703e-05, "loss": 0.9101, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 875, "tokens_per_second_per_gpu": 16800.48, "total_tokens": 86540874 }, { "epoch": 0.05476369092273068, "grad_norm": 1.0028557777404785, "learning_rate": 1.0944340212632896e-05, "loss": 0.9632, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 876, "tokens_per_second_per_gpu": 17805.03, "total_tokens": 86642545 }, { "epoch": 0.05482620655163791, "grad_norm": 1.0248699188232422, "learning_rate": 1.0956848030018762e-05, "loss": 0.9582, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 877, "tokens_per_second_per_gpu": 17872.32, "total_tokens": 86744387 }, { "epoch": 0.054888722180545135, "grad_norm": 0.9785093069076538, "learning_rate": 1.0969355847404628e-05, "loss": 0.9033, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 878, "tokens_per_second_per_gpu": 17495.24, "total_tokens": 86841985 }, { "epoch": 0.054951237809452366, "grad_norm": 1.0800327062606812, "learning_rate": 1.0981863664790495e-05, "loss": 0.9528, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 879, "tokens_per_second_per_gpu": 17483.86, "total_tokens": 86943908 }, { "epoch": 0.05501375343835959, "grad_norm": 1.1863099336624146, "learning_rate": 1.099437148217636e-05, "loss": 0.9604, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 880, "tokens_per_second_per_gpu": 17922.46, "total_tokens": 87043953 }, { "epoch": 0.055076269067266814, "grad_norm": 1.0216132402420044, "learning_rate": 1.1006879299562228e-05, "loss": 0.8997, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 881, "tokens_per_second_per_gpu": 16716.06, "total_tokens": 87139404 }, { "epoch": 0.055138784696174045, "grad_norm": 1.0250781774520874, "learning_rate": 1.1019387116948092e-05, "loss": 0.9188, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 882, "tokens_per_second_per_gpu": 16739.41, "total_tokens": 87231193 }, { "epoch": 0.05520130032508127, "grad_norm": 1.0053526163101196, "learning_rate": 1.103189493433396e-05, "loss": 0.8941, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 883, "tokens_per_second_per_gpu": 16434.0, "total_tokens": 87326361 }, { "epoch": 0.0552638159539885, "grad_norm": 0.9914777874946594, "learning_rate": 1.1044402751719826e-05, "loss": 0.9348, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 884, "tokens_per_second_per_gpu": 17861.75, "total_tokens": 87425748 }, { "epoch": 0.055326331582895724, "grad_norm": 1.0050548315048218, "learning_rate": 1.1056910569105692e-05, "loss": 0.996, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 885, "tokens_per_second_per_gpu": 17249.69, "total_tokens": 87527731 }, { "epoch": 0.05538884721180295, "grad_norm": 0.9961769580841064, "learning_rate": 1.1069418386491558e-05, "loss": 0.8859, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 886, "tokens_per_second_per_gpu": 17722.47, "total_tokens": 87624745 }, { "epoch": 0.05545136284071018, "grad_norm": 1.0181888341903687, "learning_rate": 1.1081926203877425e-05, "loss": 0.9381, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 887, "tokens_per_second_per_gpu": 18174.68, "total_tokens": 87724815 }, { "epoch": 0.0555138784696174, "grad_norm": 1.0602366924285889, "learning_rate": 1.109443402126329e-05, "loss": 0.9415, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 888, "tokens_per_second_per_gpu": 16525.39, "total_tokens": 87819836 }, { "epoch": 0.055576394098524634, "grad_norm": 1.040725827217102, "learning_rate": 1.1106941838649157e-05, "loss": 0.9387, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 889, "tokens_per_second_per_gpu": 16928.36, "total_tokens": 87916616 }, { "epoch": 0.05563890972743186, "grad_norm": 1.0302475690841675, "learning_rate": 1.1119449656035022e-05, "loss": 0.9773, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 890, "tokens_per_second_per_gpu": 17708.44, "total_tokens": 88018249 }, { "epoch": 0.05570142535633908, "grad_norm": 0.9884721040725708, "learning_rate": 1.1131957473420889e-05, "loss": 0.9625, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 891, "tokens_per_second_per_gpu": 17626.37, "total_tokens": 88116562 }, { "epoch": 0.05576394098524631, "grad_norm": 1.0461924076080322, "learning_rate": 1.1144465290806754e-05, "loss": 0.9806, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 892, "tokens_per_second_per_gpu": 17097.66, "total_tokens": 88212701 }, { "epoch": 0.05582645661415354, "grad_norm": 0.9625095725059509, "learning_rate": 1.115697310819262e-05, "loss": 0.9426, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 893, "tokens_per_second_per_gpu": 17485.92, "total_tokens": 88315474 }, { "epoch": 0.05588897224306077, "grad_norm": 1.0617613792419434, "learning_rate": 1.1169480925578486e-05, "loss": 0.9314, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 894, "tokens_per_second_per_gpu": 15729.68, "total_tokens": 88410761 }, { "epoch": 0.05595148787196799, "grad_norm": 1.041585922241211, "learning_rate": 1.1181988742964354e-05, "loss": 0.9586, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 895, "tokens_per_second_per_gpu": 16916.93, "total_tokens": 88505470 }, { "epoch": 0.056014003500875216, "grad_norm": 1.0385459661483765, "learning_rate": 1.1194496560350218e-05, "loss": 0.959, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 896, "tokens_per_second_per_gpu": 17620.42, "total_tokens": 88605128 }, { "epoch": 0.05607651912978245, "grad_norm": 1.019511342048645, "learning_rate": 1.1207004377736087e-05, "loss": 0.929, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 897, "tokens_per_second_per_gpu": 18192.28, "total_tokens": 88702104 }, { "epoch": 0.05613903475868967, "grad_norm": 1.0274710655212402, "learning_rate": 1.1219512195121953e-05, "loss": 0.8967, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 898, "tokens_per_second_per_gpu": 17096.54, "total_tokens": 88797019 }, { "epoch": 0.0562015503875969, "grad_norm": 1.067718267440796, "learning_rate": 1.1232020012507819e-05, "loss": 0.9648, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 899, "tokens_per_second_per_gpu": 16677.27, "total_tokens": 88893109 }, { "epoch": 0.056264066016504126, "grad_norm": 0.9696674346923828, "learning_rate": 1.1244527829893686e-05, "loss": 0.9057, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 900, "tokens_per_second_per_gpu": 17738.47, "total_tokens": 88991011 }, { "epoch": 0.05632658164541135, "grad_norm": 1.0735628604888916, "learning_rate": 1.125703564727955e-05, "loss": 0.9295, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 901, "tokens_per_second_per_gpu": 16988.72, "total_tokens": 89086024 }, { "epoch": 0.05638909727431858, "grad_norm": 1.0400105714797974, "learning_rate": 1.1269543464665418e-05, "loss": 0.9478, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 902, "tokens_per_second_per_gpu": 17333.75, "total_tokens": 89182336 }, { "epoch": 0.056451612903225805, "grad_norm": 1.0759153366088867, "learning_rate": 1.1282051282051283e-05, "loss": 0.9233, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 903, "tokens_per_second_per_gpu": 17047.05, "total_tokens": 89278211 }, { "epoch": 0.056514128532133036, "grad_norm": 0.9555583000183105, "learning_rate": 1.129455909943715e-05, "loss": 0.919, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 904, "tokens_per_second_per_gpu": 17384.18, "total_tokens": 89376868 }, { "epoch": 0.05657664416104026, "grad_norm": 1.0198967456817627, "learning_rate": 1.1307066916823015e-05, "loss": 0.9702, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 905, "tokens_per_second_per_gpu": 17171.33, "total_tokens": 89476032 }, { "epoch": 0.056639159789947484, "grad_norm": 1.0236849784851074, "learning_rate": 1.1319574734208882e-05, "loss": 0.9412, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 906, "tokens_per_second_per_gpu": 17304.81, "total_tokens": 89577652 }, { "epoch": 0.056701675418854715, "grad_norm": 0.9722591638565063, "learning_rate": 1.1332082551594747e-05, "loss": 0.9483, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 907, "tokens_per_second_per_gpu": 17442.08, "total_tokens": 89679223 }, { "epoch": 0.05676419104776194, "grad_norm": 1.0597522258758545, "learning_rate": 1.1344590368980616e-05, "loss": 0.9289, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 908, "tokens_per_second_per_gpu": 16859.18, "total_tokens": 89776416 }, { "epoch": 0.05682670667666917, "grad_norm": 1.057592511177063, "learning_rate": 1.1357098186366479e-05, "loss": 0.8978, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 909, "tokens_per_second_per_gpu": 16998.79, "total_tokens": 89872538 }, { "epoch": 0.056889222305576394, "grad_norm": 1.0001637935638428, "learning_rate": 1.1369606003752348e-05, "loss": 0.9322, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 910, "tokens_per_second_per_gpu": 17628.46, "total_tokens": 89970773 }, { "epoch": 0.05695173793448362, "grad_norm": 1.0159491300582886, "learning_rate": 1.1382113821138213e-05, "loss": 0.9064, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 911, "tokens_per_second_per_gpu": 17118.0, "total_tokens": 90069287 }, { "epoch": 0.05701425356339085, "grad_norm": 0.997652530670166, "learning_rate": 1.139462163852408e-05, "loss": 0.8847, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 912, "tokens_per_second_per_gpu": 17100.8, "total_tokens": 90163229 }, { "epoch": 0.05707676919229807, "grad_norm": 1.0591034889221191, "learning_rate": 1.1407129455909945e-05, "loss": 0.9886, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 913, "tokens_per_second_per_gpu": 18266.99, "total_tokens": 90266553 }, { "epoch": 0.057139284821205304, "grad_norm": 0.9876044988632202, "learning_rate": 1.1419637273295812e-05, "loss": 0.8971, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 914, "tokens_per_second_per_gpu": 16471.63, "total_tokens": 90362752 }, { "epoch": 0.05720180045011253, "grad_norm": 1.0394517183303833, "learning_rate": 1.1432145090681677e-05, "loss": 0.9439, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 915, "tokens_per_second_per_gpu": 16636.42, "total_tokens": 90459410 }, { "epoch": 0.05726431607901975, "grad_norm": 1.1235467195510864, "learning_rate": 1.1444652908067544e-05, "loss": 0.9455, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 916, "tokens_per_second_per_gpu": 16661.74, "total_tokens": 90553325 }, { "epoch": 0.05732683170792698, "grad_norm": 1.073258399963379, "learning_rate": 1.1457160725453409e-05, "loss": 0.966, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 917, "tokens_per_second_per_gpu": 16966.67, "total_tokens": 90650935 }, { "epoch": 0.05738934733683421, "grad_norm": 1.0275664329528809, "learning_rate": 1.1469668542839276e-05, "loss": 0.9839, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 918, "tokens_per_second_per_gpu": 17657.87, "total_tokens": 90752692 }, { "epoch": 0.05745186296574144, "grad_norm": 0.9978224039077759, "learning_rate": 1.1482176360225141e-05, "loss": 0.9166, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 919, "tokens_per_second_per_gpu": 17608.69, "total_tokens": 90853354 }, { "epoch": 0.05751437859464866, "grad_norm": 1.0160077810287476, "learning_rate": 1.1494684177611008e-05, "loss": 1.0448, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 920, "tokens_per_second_per_gpu": 17991.39, "total_tokens": 90952689 }, { "epoch": 0.057576894223555886, "grad_norm": 1.0166703462600708, "learning_rate": 1.1507191994996873e-05, "loss": 0.8996, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 921, "tokens_per_second_per_gpu": 15573.04, "total_tokens": 91046125 }, { "epoch": 0.05763940985246312, "grad_norm": 1.007768154144287, "learning_rate": 1.151969981238274e-05, "loss": 0.8674, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 922, "tokens_per_second_per_gpu": 17377.76, "total_tokens": 91145505 }, { "epoch": 0.05770192548137034, "grad_norm": 0.9905183911323547, "learning_rate": 1.1532207629768605e-05, "loss": 0.9178, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 923, "tokens_per_second_per_gpu": 16887.88, "total_tokens": 91246809 }, { "epoch": 0.05776444111027757, "grad_norm": 1.02533757686615, "learning_rate": 1.1544715447154474e-05, "loss": 0.9693, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 924, "tokens_per_second_per_gpu": 17030.43, "total_tokens": 91345270 }, { "epoch": 0.057826956739184796, "grad_norm": 1.0279325246810913, "learning_rate": 1.1557223264540337e-05, "loss": 0.976, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 925, "tokens_per_second_per_gpu": 17682.38, "total_tokens": 91445302 }, { "epoch": 0.057889472368092026, "grad_norm": 0.9985703229904175, "learning_rate": 1.1569731081926206e-05, "loss": 0.9491, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 926, "tokens_per_second_per_gpu": 17629.13, "total_tokens": 91541911 }, { "epoch": 0.05795198799699925, "grad_norm": 1.008923888206482, "learning_rate": 1.1582238899312071e-05, "loss": 0.9158, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 927, "tokens_per_second_per_gpu": 17286.16, "total_tokens": 91636675 }, { "epoch": 0.058014503625906474, "grad_norm": 1.1096125841140747, "learning_rate": 1.1594746716697938e-05, "loss": 0.9302, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 928, "tokens_per_second_per_gpu": 16405.39, "total_tokens": 91729842 }, { "epoch": 0.058077019254813705, "grad_norm": 1.0683645009994507, "learning_rate": 1.1607254534083803e-05, "loss": 0.9763, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 929, "tokens_per_second_per_gpu": 18202.91, "total_tokens": 91830729 }, { "epoch": 0.05813953488372093, "grad_norm": 0.9987587928771973, "learning_rate": 1.161976235146967e-05, "loss": 0.9213, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 930, "tokens_per_second_per_gpu": 17100.56, "total_tokens": 91930594 }, { "epoch": 0.05820205051262816, "grad_norm": 1.033536672592163, "learning_rate": 1.1632270168855535e-05, "loss": 0.9124, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 931, "tokens_per_second_per_gpu": 17226.42, "total_tokens": 92023668 }, { "epoch": 0.058264566141535384, "grad_norm": 1.0038982629776, "learning_rate": 1.1644777986241402e-05, "loss": 0.8997, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 932, "tokens_per_second_per_gpu": 16643.56, "total_tokens": 92117857 }, { "epoch": 0.05832708177044261, "grad_norm": 1.0041189193725586, "learning_rate": 1.1657285803627267e-05, "loss": 0.9389, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 933, "tokens_per_second_per_gpu": 16856.71, "total_tokens": 92213589 }, { "epoch": 0.05838959739934984, "grad_norm": 1.0535776615142822, "learning_rate": 1.1669793621013134e-05, "loss": 0.973, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 934, "tokens_per_second_per_gpu": 16196.48, "total_tokens": 92310606 }, { "epoch": 0.05845211302825706, "grad_norm": 1.036463737487793, "learning_rate": 1.1682301438399e-05, "loss": 0.9895, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 935, "tokens_per_second_per_gpu": 17403.81, "total_tokens": 92410690 }, { "epoch": 0.058514628657164294, "grad_norm": 0.9737445116043091, "learning_rate": 1.1694809255784866e-05, "loss": 0.9684, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 936, "tokens_per_second_per_gpu": 18596.44, "total_tokens": 92514174 }, { "epoch": 0.05857714428607152, "grad_norm": 1.0090948343276978, "learning_rate": 1.1707317073170731e-05, "loss": 0.9321, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 937, "tokens_per_second_per_gpu": 16458.05, "total_tokens": 92611026 }, { "epoch": 0.05863965991497874, "grad_norm": 1.1541048288345337, "learning_rate": 1.17198248905566e-05, "loss": 0.9331, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 938, "tokens_per_second_per_gpu": 17419.53, "total_tokens": 92710564 }, { "epoch": 0.05870217554388597, "grad_norm": 0.9745938181877136, "learning_rate": 1.1732332707942464e-05, "loss": 0.9097, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 939, "tokens_per_second_per_gpu": 17152.8, "total_tokens": 92809506 }, { "epoch": 0.0587646911727932, "grad_norm": 1.0317859649658203, "learning_rate": 1.1744840525328332e-05, "loss": 0.9114, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 940, "tokens_per_second_per_gpu": 16153.68, "total_tokens": 92904193 }, { "epoch": 0.05882720680170043, "grad_norm": 1.0293720960617065, "learning_rate": 1.1757348342714197e-05, "loss": 0.9455, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 941, "tokens_per_second_per_gpu": 17224.39, "total_tokens": 93002538 }, { "epoch": 0.05888972243060765, "grad_norm": 1.0209516286849976, "learning_rate": 1.1769856160100064e-05, "loss": 0.9646, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 942, "tokens_per_second_per_gpu": 18018.91, "total_tokens": 93103733 }, { "epoch": 0.058952238059514876, "grad_norm": 1.0459585189819336, "learning_rate": 1.178236397748593e-05, "loss": 0.9639, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 943, "tokens_per_second_per_gpu": 17384.65, "total_tokens": 93206036 }, { "epoch": 0.05901475368842211, "grad_norm": 1.0337706804275513, "learning_rate": 1.1794871794871796e-05, "loss": 0.9475, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 944, "tokens_per_second_per_gpu": 17570.01, "total_tokens": 93304736 }, { "epoch": 0.05907726931732933, "grad_norm": 1.0175576210021973, "learning_rate": 1.1807379612257661e-05, "loss": 0.9118, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 945, "tokens_per_second_per_gpu": 17530.15, "total_tokens": 93401611 }, { "epoch": 0.05913978494623656, "grad_norm": 0.966780424118042, "learning_rate": 1.1819887429643528e-05, "loss": 0.9527, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 946, "tokens_per_second_per_gpu": 17505.72, "total_tokens": 93502380 }, { "epoch": 0.059202300575143786, "grad_norm": 0.9925317168235779, "learning_rate": 1.1832395247029394e-05, "loss": 0.9275, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 947, "tokens_per_second_per_gpu": 17769.11, "total_tokens": 93602423 }, { "epoch": 0.05926481620405101, "grad_norm": 1.0453920364379883, "learning_rate": 1.184490306441526e-05, "loss": 0.9541, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 948, "tokens_per_second_per_gpu": 17261.77, "total_tokens": 93698545 }, { "epoch": 0.05932733183295824, "grad_norm": 1.0264264345169067, "learning_rate": 1.1857410881801126e-05, "loss": 0.9346, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 949, "tokens_per_second_per_gpu": 17929.05, "total_tokens": 93799853 }, { "epoch": 0.059389847461865465, "grad_norm": 0.9623929858207703, "learning_rate": 1.1869918699186992e-05, "loss": 0.9113, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 950, "tokens_per_second_per_gpu": 17602.83, "total_tokens": 93898776 }, { "epoch": 0.059452363090772696, "grad_norm": 0.9622788429260254, "learning_rate": 1.1882426516572858e-05, "loss": 0.9448, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 951, "tokens_per_second_per_gpu": 18483.19, "total_tokens": 93999024 }, { "epoch": 0.05951487871967992, "grad_norm": 0.9936200976371765, "learning_rate": 1.1894934333958725e-05, "loss": 0.9043, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 952, "tokens_per_second_per_gpu": 16386.24, "total_tokens": 94097089 }, { "epoch": 0.059577394348587144, "grad_norm": 0.9732035398483276, "learning_rate": 1.190744215134459e-05, "loss": 0.9491, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 953, "tokens_per_second_per_gpu": 17541.39, "total_tokens": 94197468 }, { "epoch": 0.059639909977494375, "grad_norm": 1.059558391571045, "learning_rate": 1.1919949968730458e-05, "loss": 0.9032, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 954, "tokens_per_second_per_gpu": 15622.72, "total_tokens": 94288908 }, { "epoch": 0.0597024256064016, "grad_norm": 1.0735541582107544, "learning_rate": 1.1932457786116322e-05, "loss": 0.94, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 955, "tokens_per_second_per_gpu": 16374.14, "total_tokens": 94383260 }, { "epoch": 0.05976494123530883, "grad_norm": 1.0191161632537842, "learning_rate": 1.194496560350219e-05, "loss": 0.9709, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 956, "tokens_per_second_per_gpu": 16771.43, "total_tokens": 94479636 }, { "epoch": 0.059827456864216054, "grad_norm": 1.0059351921081543, "learning_rate": 1.1957473420888056e-05, "loss": 0.935, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 957, "tokens_per_second_per_gpu": 16380.88, "total_tokens": 94574568 }, { "epoch": 0.05988997249312328, "grad_norm": 0.9920620918273926, "learning_rate": 1.1969981238273922e-05, "loss": 0.901, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 958, "tokens_per_second_per_gpu": 17118.84, "total_tokens": 94675034 }, { "epoch": 0.05995248812203051, "grad_norm": 0.9893505573272705, "learning_rate": 1.1982489055659788e-05, "loss": 0.8824, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 959, "tokens_per_second_per_gpu": 16957.78, "total_tokens": 94771609 }, { "epoch": 0.06001500375093773, "grad_norm": 1.043122410774231, "learning_rate": 1.1994996873045655e-05, "loss": 0.9343, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 960, "tokens_per_second_per_gpu": 16628.94, "total_tokens": 94868042 }, { "epoch": 0.060077519379844964, "grad_norm": 1.023403525352478, "learning_rate": 1.2007504690431521e-05, "loss": 0.9079, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 961, "tokens_per_second_per_gpu": 18007.51, "total_tokens": 94968740 }, { "epoch": 0.06014003500875219, "grad_norm": 1.011742353439331, "learning_rate": 1.2020012507817387e-05, "loss": 0.9189, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 962, "tokens_per_second_per_gpu": 17283.82, "total_tokens": 95066939 }, { "epoch": 0.06020255063765941, "grad_norm": 1.0050654411315918, "learning_rate": 1.2032520325203254e-05, "loss": 0.9162, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 963, "tokens_per_second_per_gpu": 17567.31, "total_tokens": 95164733 }, { "epoch": 0.06026506626656664, "grad_norm": 1.0017096996307373, "learning_rate": 1.2045028142589119e-05, "loss": 0.9158, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 964, "tokens_per_second_per_gpu": 17529.72, "total_tokens": 95259128 }, { "epoch": 0.06032758189547387, "grad_norm": 1.0126692056655884, "learning_rate": 1.2057535959974986e-05, "loss": 0.9467, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 965, "tokens_per_second_per_gpu": 19128.24, "total_tokens": 95361291 }, { "epoch": 0.0603900975243811, "grad_norm": 0.9744247794151306, "learning_rate": 1.207004377736085e-05, "loss": 0.9093, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 966, "tokens_per_second_per_gpu": 15985.52, "total_tokens": 95456027 }, { "epoch": 0.06045261315328832, "grad_norm": 0.9711417555809021, "learning_rate": 1.208255159474672e-05, "loss": 0.9539, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 967, "tokens_per_second_per_gpu": 17907.09, "total_tokens": 95554962 }, { "epoch": 0.060515128782195546, "grad_norm": 1.0153273344039917, "learning_rate": 1.2095059412132585e-05, "loss": 0.9155, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 968, "tokens_per_second_per_gpu": 17212.92, "total_tokens": 95654982 }, { "epoch": 0.06057764441110278, "grad_norm": 1.2298016548156738, "learning_rate": 1.2107567229518451e-05, "loss": 0.9528, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 969, "tokens_per_second_per_gpu": 17237.5, "total_tokens": 95753424 }, { "epoch": 0.06064016004001, "grad_norm": 1.0187987089157104, "learning_rate": 1.2120075046904317e-05, "loss": 0.9172, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 970, "tokens_per_second_per_gpu": 16925.92, "total_tokens": 95847750 }, { "epoch": 0.06070267566891723, "grad_norm": 1.01676607131958, "learning_rate": 1.2132582864290184e-05, "loss": 0.9196, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 971, "tokens_per_second_per_gpu": 16694.01, "total_tokens": 95942952 }, { "epoch": 0.060765191297824456, "grad_norm": 1.0065252780914307, "learning_rate": 1.2145090681676049e-05, "loss": 0.9418, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 972, "tokens_per_second_per_gpu": 17186.5, "total_tokens": 96041696 }, { "epoch": 0.06082770692673168, "grad_norm": 1.0001071691513062, "learning_rate": 1.2157598499061916e-05, "loss": 0.9573, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 973, "tokens_per_second_per_gpu": 17809.96, "total_tokens": 96141617 }, { "epoch": 0.06089022255563891, "grad_norm": 1.0181587934494019, "learning_rate": 1.217010631644778e-05, "loss": 0.9409, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 974, "tokens_per_second_per_gpu": 16333.99, "total_tokens": 96238399 }, { "epoch": 0.060952738184546135, "grad_norm": 1.0095598697662354, "learning_rate": 1.2182614133833648e-05, "loss": 0.935, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 975, "tokens_per_second_per_gpu": 18162.02, "total_tokens": 96342748 }, { "epoch": 0.061015253813453366, "grad_norm": 0.9884899854660034, "learning_rate": 1.2195121951219513e-05, "loss": 0.9404, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 976, "tokens_per_second_per_gpu": 17396.82, "total_tokens": 96441533 }, { "epoch": 0.06107776944236059, "grad_norm": 0.9984365105628967, "learning_rate": 1.220762976860538e-05, "loss": 0.8901, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 977, "tokens_per_second_per_gpu": 16728.42, "total_tokens": 96538407 }, { "epoch": 0.061140285071267814, "grad_norm": 1.0560882091522217, "learning_rate": 1.2220137585991245e-05, "loss": 0.915, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 978, "tokens_per_second_per_gpu": 16331.18, "total_tokens": 96635441 }, { "epoch": 0.061202800700175045, "grad_norm": 1.0153456926345825, "learning_rate": 1.2232645403377112e-05, "loss": 0.933, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 979, "tokens_per_second_per_gpu": 17636.88, "total_tokens": 96735325 }, { "epoch": 0.06126531632908227, "grad_norm": 1.0559295415878296, "learning_rate": 1.2245153220762977e-05, "loss": 0.9247, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 980, "tokens_per_second_per_gpu": 16364.96, "total_tokens": 96831863 }, { "epoch": 0.0613278319579895, "grad_norm": 1.040934443473816, "learning_rate": 1.2257661038148846e-05, "loss": 0.9491, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 981, "tokens_per_second_per_gpu": 16713.97, "total_tokens": 96927132 }, { "epoch": 0.061390347586896724, "grad_norm": 1.0571544170379639, "learning_rate": 1.2270168855534709e-05, "loss": 1.0011, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 982, "tokens_per_second_per_gpu": 17964.62, "total_tokens": 97025230 }, { "epoch": 0.06145286321580395, "grad_norm": 0.9920820593833923, "learning_rate": 1.2282676672920578e-05, "loss": 0.9011, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 983, "tokens_per_second_per_gpu": 15975.15, "total_tokens": 97122191 }, { "epoch": 0.06151537884471118, "grad_norm": 1.0675461292266846, "learning_rate": 1.2295184490306443e-05, "loss": 0.9839, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 984, "tokens_per_second_per_gpu": 17153.44, "total_tokens": 97221326 }, { "epoch": 0.0615778944736184, "grad_norm": 0.9620205163955688, "learning_rate": 1.230769230769231e-05, "loss": 0.9115, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 985, "tokens_per_second_per_gpu": 17449.73, "total_tokens": 97318550 }, { "epoch": 0.06164041010252563, "grad_norm": 1.0153425931930542, "learning_rate": 1.2320200125078175e-05, "loss": 0.8774, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 986, "tokens_per_second_per_gpu": 16910.82, "total_tokens": 97415108 }, { "epoch": 0.06170292573143286, "grad_norm": 1.007986068725586, "learning_rate": 1.2332707942464042e-05, "loss": 0.9317, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 987, "tokens_per_second_per_gpu": 17472.06, "total_tokens": 97514082 }, { "epoch": 0.06176544136034009, "grad_norm": 0.979292094707489, "learning_rate": 1.2345215759849907e-05, "loss": 0.885, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 988, "tokens_per_second_per_gpu": 17507.7, "total_tokens": 97609756 }, { "epoch": 0.06182795698924731, "grad_norm": 0.9741080403327942, "learning_rate": 1.2357723577235774e-05, "loss": 0.916, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 989, "tokens_per_second_per_gpu": 17927.03, "total_tokens": 97710807 }, { "epoch": 0.061890472618154536, "grad_norm": 1.0108211040496826, "learning_rate": 1.2370231394621639e-05, "loss": 0.8956, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 990, "tokens_per_second_per_gpu": 17609.2, "total_tokens": 97809625 }, { "epoch": 0.06195298824706177, "grad_norm": 1.0655699968338013, "learning_rate": 1.2382739212007506e-05, "loss": 0.9194, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 991, "tokens_per_second_per_gpu": 17078.79, "total_tokens": 97905824 }, { "epoch": 0.06201550387596899, "grad_norm": 0.9866424798965454, "learning_rate": 1.2395247029393371e-05, "loss": 0.8807, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 992, "tokens_per_second_per_gpu": 17614.32, "total_tokens": 98004882 }, { "epoch": 0.06207801950487622, "grad_norm": 1.004819631576538, "learning_rate": 1.2407754846779238e-05, "loss": 0.8998, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 993, "tokens_per_second_per_gpu": 17437.14, "total_tokens": 98101738 }, { "epoch": 0.062140535133783446, "grad_norm": 0.9907852411270142, "learning_rate": 1.2420262664165103e-05, "loss": 0.9058, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 994, "tokens_per_second_per_gpu": 17703.41, "total_tokens": 98200094 }, { "epoch": 0.06220305076269067, "grad_norm": 1.0192914009094238, "learning_rate": 1.243277048155097e-05, "loss": 0.9475, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 995, "tokens_per_second_per_gpu": 18359.51, "total_tokens": 98299450 }, { "epoch": 0.0622655663915979, "grad_norm": 1.0105607509613037, "learning_rate": 1.2445278298936835e-05, "loss": 0.9412, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 996, "tokens_per_second_per_gpu": 17377.64, "total_tokens": 98395128 }, { "epoch": 0.062328082020505125, "grad_norm": 1.032723069190979, "learning_rate": 1.2457786116322704e-05, "loss": 0.9026, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 997, "tokens_per_second_per_gpu": 17403.34, "total_tokens": 98489667 }, { "epoch": 0.062390597649412356, "grad_norm": 0.9980921745300293, "learning_rate": 1.2470293933708567e-05, "loss": 0.9382, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 998, "tokens_per_second_per_gpu": 17686.6, "total_tokens": 98587273 }, { "epoch": 0.06245311327831958, "grad_norm": 0.9703108668327332, "learning_rate": 1.2482801751094436e-05, "loss": 0.9213, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 999, "tokens_per_second_per_gpu": 17190.51, "total_tokens": 98685335 }, { "epoch": 0.06251562890722681, "grad_norm": 0.9749014973640442, "learning_rate": 1.2495309568480301e-05, "loss": 0.8937, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1000, "tokens_per_second_per_gpu": 17986.95, "total_tokens": 98783837 }, { "epoch": 0.06257814453613403, "grad_norm": 0.9957938194274902, "learning_rate": 1.2507817385866168e-05, "loss": 0.913, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1001, "tokens_per_second_per_gpu": 16283.23, "total_tokens": 98878323 }, { "epoch": 0.06264066016504126, "grad_norm": 1.087856411933899, "learning_rate": 1.2520325203252033e-05, "loss": 0.8809, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1002, "tokens_per_second_per_gpu": 15994.86, "total_tokens": 98971152 }, { "epoch": 0.06270317579394849, "grad_norm": 0.9990292191505432, "learning_rate": 1.25328330206379e-05, "loss": 0.8913, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1003, "tokens_per_second_per_gpu": 16452.35, "total_tokens": 99071282 }, { "epoch": 0.06276569142285571, "grad_norm": 0.9667826294898987, "learning_rate": 1.2545340838023765e-05, "loss": 0.8889, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1004, "tokens_per_second_per_gpu": 17668.66, "total_tokens": 99169715 }, { "epoch": 0.06282820705176294, "grad_norm": 1.0315039157867432, "learning_rate": 1.2557848655409632e-05, "loss": 0.9161, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1005, "tokens_per_second_per_gpu": 16511.47, "total_tokens": 99264979 }, { "epoch": 0.06289072268067017, "grad_norm": 1.0060831308364868, "learning_rate": 1.2570356472795497e-05, "loss": 0.8911, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1006, "tokens_per_second_per_gpu": 17362.98, "total_tokens": 99362763 }, { "epoch": 0.0629532383095774, "grad_norm": 0.9971218705177307, "learning_rate": 1.2582864290181364e-05, "loss": 0.8947, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1007, "tokens_per_second_per_gpu": 15567.18, "total_tokens": 99454401 }, { "epoch": 0.06301575393848462, "grad_norm": 0.9996000528335571, "learning_rate": 1.259537210756723e-05, "loss": 0.9067, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1008, "tokens_per_second_per_gpu": 16910.08, "total_tokens": 99548856 }, { "epoch": 0.06307826956739185, "grad_norm": 1.0278842449188232, "learning_rate": 1.2607879924953096e-05, "loss": 0.9254, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1009, "tokens_per_second_per_gpu": 16960.03, "total_tokens": 99643645 }, { "epoch": 0.06314078519629908, "grad_norm": 0.9763393998146057, "learning_rate": 1.2620387742338962e-05, "loss": 0.8952, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1010, "tokens_per_second_per_gpu": 15913.02, "total_tokens": 99738748 }, { "epoch": 0.0632033008252063, "grad_norm": 0.9668099284172058, "learning_rate": 1.263289555972483e-05, "loss": 0.9124, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1011, "tokens_per_second_per_gpu": 17715.49, "total_tokens": 99837330 }, { "epoch": 0.06326581645411353, "grad_norm": 0.9568635821342468, "learning_rate": 1.2645403377110694e-05, "loss": 0.8757, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1012, "tokens_per_second_per_gpu": 16585.63, "total_tokens": 99934793 }, { "epoch": 0.06332833208302076, "grad_norm": 1.0008305311203003, "learning_rate": 1.2657911194496562e-05, "loss": 0.9744, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1013, "tokens_per_second_per_gpu": 17687.72, "total_tokens": 100035907 }, { "epoch": 0.06339084771192798, "grad_norm": 1.0263934135437012, "learning_rate": 1.2670419011882427e-05, "loss": 0.9275, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1014, "tokens_per_second_per_gpu": 16419.74, "total_tokens": 100130272 }, { "epoch": 0.0634533633408352, "grad_norm": 0.9894859194755554, "learning_rate": 1.2682926829268294e-05, "loss": 0.9331, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1015, "tokens_per_second_per_gpu": 17391.29, "total_tokens": 100230635 }, { "epoch": 0.06351587896974244, "grad_norm": 0.9784866571426392, "learning_rate": 1.269543464665416e-05, "loss": 0.9308, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1016, "tokens_per_second_per_gpu": 16861.25, "total_tokens": 100330097 }, { "epoch": 0.06357839459864967, "grad_norm": 1.0351656675338745, "learning_rate": 1.2707942464040026e-05, "loss": 0.9084, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1017, "tokens_per_second_per_gpu": 16617.59, "total_tokens": 100424833 }, { "epoch": 0.06364091022755688, "grad_norm": 1.0293618440628052, "learning_rate": 1.2720450281425892e-05, "loss": 0.8864, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1018, "tokens_per_second_per_gpu": 17166.49, "total_tokens": 100518187 }, { "epoch": 0.06370342585646412, "grad_norm": 1.0545486211776733, "learning_rate": 1.2732958098811758e-05, "loss": 0.891, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1019, "tokens_per_second_per_gpu": 16227.43, "total_tokens": 100612185 }, { "epoch": 0.06376594148537135, "grad_norm": 0.985016405582428, "learning_rate": 1.2745465916197624e-05, "loss": 0.9131, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1020, "tokens_per_second_per_gpu": 17213.7, "total_tokens": 100711291 }, { "epoch": 0.06382845711427856, "grad_norm": 0.9982410669326782, "learning_rate": 1.275797373358349e-05, "loss": 0.9068, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1021, "tokens_per_second_per_gpu": 17410.11, "total_tokens": 100808442 }, { "epoch": 0.0638909727431858, "grad_norm": 1.004679799079895, "learning_rate": 1.2770481550969356e-05, "loss": 0.8872, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1022, "tokens_per_second_per_gpu": 17184.45, "total_tokens": 100904532 }, { "epoch": 0.06395348837209303, "grad_norm": 1.0251320600509644, "learning_rate": 1.2782989368355223e-05, "loss": 1.0163, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1023, "tokens_per_second_per_gpu": 18887.0, "total_tokens": 101003949 }, { "epoch": 0.06401600400100026, "grad_norm": 0.9647120237350464, "learning_rate": 1.2795497185741088e-05, "loss": 0.8942, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1024, "tokens_per_second_per_gpu": 16891.08, "total_tokens": 101100949 }, { "epoch": 0.06407851962990747, "grad_norm": 0.9840918183326721, "learning_rate": 1.2808005003126955e-05, "loss": 0.9134, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1025, "tokens_per_second_per_gpu": 16868.16, "total_tokens": 101197127 }, { "epoch": 0.0641410352588147, "grad_norm": 0.9950463175773621, "learning_rate": 1.2820512820512823e-05, "loss": 0.8735, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1026, "tokens_per_second_per_gpu": 16745.14, "total_tokens": 101294542 }, { "epoch": 0.06420355088772194, "grad_norm": 0.990977942943573, "learning_rate": 1.2833020637898688e-05, "loss": 0.8721, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1027, "tokens_per_second_per_gpu": 17201.54, "total_tokens": 101390863 }, { "epoch": 0.06426606651662915, "grad_norm": 0.9211655259132385, "learning_rate": 1.2845528455284555e-05, "loss": 0.8653, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1028, "tokens_per_second_per_gpu": 16342.11, "total_tokens": 101488207 }, { "epoch": 0.06432858214553638, "grad_norm": 1.0072325468063354, "learning_rate": 1.285803627267042e-05, "loss": 0.8566, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1029, "tokens_per_second_per_gpu": 16852.69, "total_tokens": 101582257 }, { "epoch": 0.06439109777444361, "grad_norm": 1.0241621732711792, "learning_rate": 1.2870544090056287e-05, "loss": 0.944, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1030, "tokens_per_second_per_gpu": 16045.63, "total_tokens": 101674561 }, { "epoch": 0.06445361340335083, "grad_norm": 1.099469542503357, "learning_rate": 1.2883051907442153e-05, "loss": 0.8735, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1031, "tokens_per_second_per_gpu": 16178.09, "total_tokens": 101768153 }, { "epoch": 0.06451612903225806, "grad_norm": 0.9604825973510742, "learning_rate": 1.289555972482802e-05, "loss": 0.926, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1032, "tokens_per_second_per_gpu": 17999.27, "total_tokens": 101871064 }, { "epoch": 0.0645786446611653, "grad_norm": 1.0160925388336182, "learning_rate": 1.2908067542213885e-05, "loss": 0.8997, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1033, "tokens_per_second_per_gpu": 17335.43, "total_tokens": 101966126 }, { "epoch": 0.06464116029007252, "grad_norm": 0.9836959838867188, "learning_rate": 1.2920575359599752e-05, "loss": 0.8731, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1034, "tokens_per_second_per_gpu": 17322.23, "total_tokens": 102062086 }, { "epoch": 0.06470367591897974, "grad_norm": 0.9767029881477356, "learning_rate": 1.2933083176985617e-05, "loss": 0.8785, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1035, "tokens_per_second_per_gpu": 16717.71, "total_tokens": 102157683 }, { "epoch": 0.06476619154788697, "grad_norm": 0.9712253212928772, "learning_rate": 1.2945590994371484e-05, "loss": 0.8772, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1036, "tokens_per_second_per_gpu": 16877.17, "total_tokens": 102253794 }, { "epoch": 0.0648287071767942, "grad_norm": 1.025564432144165, "learning_rate": 1.2958098811757349e-05, "loss": 0.9448, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1037, "tokens_per_second_per_gpu": 15903.83, "total_tokens": 102346442 }, { "epoch": 0.06489122280570142, "grad_norm": 0.9820135831832886, "learning_rate": 1.2970606629143216e-05, "loss": 0.9092, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1038, "tokens_per_second_per_gpu": 15712.44, "total_tokens": 102440535 }, { "epoch": 0.06495373843460865, "grad_norm": 1.1009376049041748, "learning_rate": 1.2983114446529081e-05, "loss": 0.9057, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1039, "tokens_per_second_per_gpu": 15411.46, "total_tokens": 102531843 }, { "epoch": 0.06501625406351588, "grad_norm": 0.9971907138824463, "learning_rate": 1.299562226391495e-05, "loss": 0.9275, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1040, "tokens_per_second_per_gpu": 18408.46, "total_tokens": 102632093 }, { "epoch": 0.0650787696924231, "grad_norm": 1.1284747123718262, "learning_rate": 1.3008130081300815e-05, "loss": 0.885, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1041, "tokens_per_second_per_gpu": 17602.7, "total_tokens": 102731089 }, { "epoch": 0.06514128532133033, "grad_norm": 1.0075997114181519, "learning_rate": 1.3020637898686682e-05, "loss": 0.9411, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1042, "tokens_per_second_per_gpu": 18300.65, "total_tokens": 102835388 }, { "epoch": 0.06520380095023756, "grad_norm": 1.0072163343429565, "learning_rate": 1.3033145716072547e-05, "loss": 0.9868, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1043, "tokens_per_second_per_gpu": 18597.0, "total_tokens": 102940784 }, { "epoch": 0.06526631657914479, "grad_norm": 0.9809909462928772, "learning_rate": 1.3045653533458414e-05, "loss": 0.8546, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1044, "tokens_per_second_per_gpu": 16233.91, "total_tokens": 103037393 }, { "epoch": 0.06532883220805201, "grad_norm": 0.9937800765037537, "learning_rate": 1.3058161350844279e-05, "loss": 0.8506, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1045, "tokens_per_second_per_gpu": 17074.56, "total_tokens": 103134843 }, { "epoch": 0.06539134783695924, "grad_norm": 0.9918276071548462, "learning_rate": 1.3070669168230146e-05, "loss": 0.9117, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1046, "tokens_per_second_per_gpu": 17570.99, "total_tokens": 103232951 }, { "epoch": 0.06545386346586647, "grad_norm": 1.0205662250518799, "learning_rate": 1.3083176985616011e-05, "loss": 0.906, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1047, "tokens_per_second_per_gpu": 16301.67, "total_tokens": 103329247 }, { "epoch": 0.06551637909477369, "grad_norm": 1.0359065532684326, "learning_rate": 1.3095684803001878e-05, "loss": 0.9415, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1048, "tokens_per_second_per_gpu": 18889.96, "total_tokens": 103427889 }, { "epoch": 0.06557889472368092, "grad_norm": 1.0405791997909546, "learning_rate": 1.3108192620387743e-05, "loss": 0.9214, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1049, "tokens_per_second_per_gpu": 18333.14, "total_tokens": 103526558 }, { "epoch": 0.06564141035258815, "grad_norm": 0.9993111491203308, "learning_rate": 1.312070043777361e-05, "loss": 0.9242, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1050, "tokens_per_second_per_gpu": 18732.59, "total_tokens": 103627727 }, { "epoch": 0.06570392598149537, "grad_norm": 1.0468510389328003, "learning_rate": 1.3133208255159475e-05, "loss": 0.9195, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1051, "tokens_per_second_per_gpu": 18199.5, "total_tokens": 103726350 }, { "epoch": 0.0657664416104026, "grad_norm": 0.9928292036056519, "learning_rate": 1.3145716072545342e-05, "loss": 0.9338, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1052, "tokens_per_second_per_gpu": 18400.52, "total_tokens": 103832451 }, { "epoch": 0.06582895723930983, "grad_norm": 1.127306580543518, "learning_rate": 1.3158223889931207e-05, "loss": 0.9236, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1053, "tokens_per_second_per_gpu": 17397.43, "total_tokens": 103931947 }, { "epoch": 0.06589147286821706, "grad_norm": 0.9732884764671326, "learning_rate": 1.3170731707317076e-05, "loss": 0.8283, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1054, "tokens_per_second_per_gpu": 17035.17, "total_tokens": 104029296 }, { "epoch": 0.06595398849712428, "grad_norm": 1.0380319356918335, "learning_rate": 1.3183239524702939e-05, "loss": 0.8903, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1055, "tokens_per_second_per_gpu": 16163.83, "total_tokens": 104126708 }, { "epoch": 0.06601650412603151, "grad_norm": 0.9832822680473328, "learning_rate": 1.3195747342088808e-05, "loss": 0.8831, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1056, "tokens_per_second_per_gpu": 17405.69, "total_tokens": 104226876 }, { "epoch": 0.06607901975493874, "grad_norm": 1.0310359001159668, "learning_rate": 1.3208255159474673e-05, "loss": 0.9261, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1057, "tokens_per_second_per_gpu": 18159.42, "total_tokens": 104326456 }, { "epoch": 0.06614153538384596, "grad_norm": 0.9598349332809448, "learning_rate": 1.322076297686054e-05, "loss": 0.9214, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1058, "tokens_per_second_per_gpu": 17985.0, "total_tokens": 104427032 }, { "epoch": 0.06620405101275319, "grad_norm": 0.9802102446556091, "learning_rate": 1.3233270794246405e-05, "loss": 0.9132, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1059, "tokens_per_second_per_gpu": 17668.58, "total_tokens": 104529646 }, { "epoch": 0.06626656664166042, "grad_norm": 1.077978253364563, "learning_rate": 1.3245778611632272e-05, "loss": 0.944, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1060, "tokens_per_second_per_gpu": 17799.74, "total_tokens": 104633168 }, { "epoch": 0.06632908227056764, "grad_norm": 0.9746183156967163, "learning_rate": 1.3258286429018137e-05, "loss": 0.9138, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1061, "tokens_per_second_per_gpu": 18179.62, "total_tokens": 104736550 }, { "epoch": 0.06639159789947487, "grad_norm": 1.041972279548645, "learning_rate": 1.3270794246404004e-05, "loss": 0.9212, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1062, "tokens_per_second_per_gpu": 17388.64, "total_tokens": 104836888 }, { "epoch": 0.0664541135283821, "grad_norm": 1.0262415409088135, "learning_rate": 1.3283302063789869e-05, "loss": 0.9091, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1063, "tokens_per_second_per_gpu": 16613.28, "total_tokens": 104934566 }, { "epoch": 0.06651662915728933, "grad_norm": 1.0777326822280884, "learning_rate": 1.3295809881175736e-05, "loss": 0.9328, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1064, "tokens_per_second_per_gpu": 16215.02, "total_tokens": 105030542 }, { "epoch": 0.06657914478619655, "grad_norm": 0.9622330069541931, "learning_rate": 1.3308317698561601e-05, "loss": 0.9035, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1065, "tokens_per_second_per_gpu": 16912.96, "total_tokens": 105129657 }, { "epoch": 0.06664166041510378, "grad_norm": 0.9437965750694275, "learning_rate": 1.3320825515947468e-05, "loss": 0.9118, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1066, "tokens_per_second_per_gpu": 18603.72, "total_tokens": 105232162 }, { "epoch": 0.06670417604401101, "grad_norm": 0.9742669463157654, "learning_rate": 1.3333333333333333e-05, "loss": 0.8917, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1067, "tokens_per_second_per_gpu": 17715.26, "total_tokens": 105332345 }, { "epoch": 0.06676669167291822, "grad_norm": 0.9400179982185364, "learning_rate": 1.33458411507192e-05, "loss": 0.8875, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1068, "tokens_per_second_per_gpu": 17529.12, "total_tokens": 105434219 }, { "epoch": 0.06682920730182546, "grad_norm": 0.9660691618919373, "learning_rate": 1.3358348968105065e-05, "loss": 0.8622, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1069, "tokens_per_second_per_gpu": 17581.94, "total_tokens": 105528556 }, { "epoch": 0.06689172293073269, "grad_norm": 1.04258131980896, "learning_rate": 1.3370856785490934e-05, "loss": 0.9709, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1070, "tokens_per_second_per_gpu": 17814.94, "total_tokens": 105631193 }, { "epoch": 0.0669542385596399, "grad_norm": 0.9479602575302124, "learning_rate": 1.3383364602876797e-05, "loss": 0.8969, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1071, "tokens_per_second_per_gpu": 17747.69, "total_tokens": 105731011 }, { "epoch": 0.06701675418854713, "grad_norm": 1.0215928554534912, "learning_rate": 1.3395872420262666e-05, "loss": 0.9054, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1072, "tokens_per_second_per_gpu": 16925.97, "total_tokens": 105829596 }, { "epoch": 0.06707926981745437, "grad_norm": 1.0377427339553833, "learning_rate": 1.3408380237648531e-05, "loss": 0.9026, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1073, "tokens_per_second_per_gpu": 17267.9, "total_tokens": 105926356 }, { "epoch": 0.0671417854463616, "grad_norm": 0.9815409779548645, "learning_rate": 1.3420888055034398e-05, "loss": 0.8932, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1074, "tokens_per_second_per_gpu": 18089.07, "total_tokens": 106027957 }, { "epoch": 0.06720430107526881, "grad_norm": 0.995633602142334, "learning_rate": 1.3433395872420263e-05, "loss": 0.9521, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1075, "tokens_per_second_per_gpu": 17876.35, "total_tokens": 106131225 }, { "epoch": 0.06726681670417604, "grad_norm": 0.9612926244735718, "learning_rate": 1.344590368980613e-05, "loss": 0.8833, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1076, "tokens_per_second_per_gpu": 17727.35, "total_tokens": 106232613 }, { "epoch": 0.06732933233308327, "grad_norm": 1.0146723985671997, "learning_rate": 1.3458411507191995e-05, "loss": 0.8878, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1077, "tokens_per_second_per_gpu": 16723.0, "total_tokens": 106327339 }, { "epoch": 0.06739184796199049, "grad_norm": 1.00763738155365, "learning_rate": 1.3470919324577862e-05, "loss": 0.8877, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1078, "tokens_per_second_per_gpu": 16927.59, "total_tokens": 106423831 }, { "epoch": 0.06745436359089772, "grad_norm": 0.9883391261100769, "learning_rate": 1.3483427141963727e-05, "loss": 0.9546, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1079, "tokens_per_second_per_gpu": 18195.5, "total_tokens": 106527875 }, { "epoch": 0.06751687921980495, "grad_norm": 0.9868289232254028, "learning_rate": 1.3495934959349594e-05, "loss": 0.9049, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1080, "tokens_per_second_per_gpu": 18472.09, "total_tokens": 106628470 }, { "epoch": 0.06757939484871218, "grad_norm": 0.9740011692047119, "learning_rate": 1.350844277673546e-05, "loss": 0.874, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1081, "tokens_per_second_per_gpu": 18182.36, "total_tokens": 106728986 }, { "epoch": 0.0676419104776194, "grad_norm": 0.955884575843811, "learning_rate": 1.3520950594121326e-05, "loss": 0.8832, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1082, "tokens_per_second_per_gpu": 18161.03, "total_tokens": 106829502 }, { "epoch": 0.06770442610652663, "grad_norm": 1.0341588258743286, "learning_rate": 1.3533458411507192e-05, "loss": 0.9197, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1083, "tokens_per_second_per_gpu": 17886.19, "total_tokens": 106926639 }, { "epoch": 0.06776694173543386, "grad_norm": 1.01216721534729, "learning_rate": 1.354596622889306e-05, "loss": 0.9107, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1084, "tokens_per_second_per_gpu": 17304.77, "total_tokens": 107027812 }, { "epoch": 0.06782945736434108, "grad_norm": 1.0179483890533447, "learning_rate": 1.3558474046278924e-05, "loss": 0.9078, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1085, "tokens_per_second_per_gpu": 16257.21, "total_tokens": 107124171 }, { "epoch": 0.06789197299324831, "grad_norm": 0.9834089279174805, "learning_rate": 1.3570981863664792e-05, "loss": 0.8873, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1086, "tokens_per_second_per_gpu": 17556.96, "total_tokens": 107224048 }, { "epoch": 0.06795448862215554, "grad_norm": 1.038439393043518, "learning_rate": 1.3583489681050657e-05, "loss": 0.9529, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1087, "tokens_per_second_per_gpu": 18692.14, "total_tokens": 107329033 }, { "epoch": 0.06801700425106276, "grad_norm": 1.1842259168624878, "learning_rate": 1.3595997498436524e-05, "loss": 0.888, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1088, "tokens_per_second_per_gpu": 17088.62, "total_tokens": 107425094 }, { "epoch": 0.06807951987996999, "grad_norm": 1.0151766538619995, "learning_rate": 1.3608505315822391e-05, "loss": 0.8841, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1089, "tokens_per_second_per_gpu": 17448.51, "total_tokens": 107521038 }, { "epoch": 0.06814203550887722, "grad_norm": 1.1717125177383423, "learning_rate": 1.3621013133208256e-05, "loss": 0.9279, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1090, "tokens_per_second_per_gpu": 16872.74, "total_tokens": 107612757 }, { "epoch": 0.06820455113778445, "grad_norm": 1.0576902627944946, "learning_rate": 1.3633520950594123e-05, "loss": 0.9301, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1091, "tokens_per_second_per_gpu": 16779.17, "total_tokens": 107711200 }, { "epoch": 0.06826706676669167, "grad_norm": 0.9920963644981384, "learning_rate": 1.3646028767979988e-05, "loss": 0.8687, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1092, "tokens_per_second_per_gpu": 16585.2, "total_tokens": 107806763 }, { "epoch": 0.0683295823955989, "grad_norm": 1.0652390718460083, "learning_rate": 1.3658536585365855e-05, "loss": 0.9127, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1093, "tokens_per_second_per_gpu": 16730.85, "total_tokens": 107905896 }, { "epoch": 0.06839209802450613, "grad_norm": 1.1018115282058716, "learning_rate": 1.367104440275172e-05, "loss": 0.8798, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1094, "tokens_per_second_per_gpu": 17524.87, "total_tokens": 108003207 }, { "epoch": 0.06845461365341335, "grad_norm": 1.027275800704956, "learning_rate": 1.3683552220137587e-05, "loss": 0.8479, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1095, "tokens_per_second_per_gpu": 16893.47, "total_tokens": 108098159 }, { "epoch": 0.06851712928232058, "grad_norm": 1.0563483238220215, "learning_rate": 1.3696060037523453e-05, "loss": 0.9121, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1096, "tokens_per_second_per_gpu": 17355.87, "total_tokens": 108197917 }, { "epoch": 0.06857964491122781, "grad_norm": 0.9625040292739868, "learning_rate": 1.3708567854909321e-05, "loss": 0.8702, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1097, "tokens_per_second_per_gpu": 18493.79, "total_tokens": 108299621 }, { "epoch": 0.06864216054013503, "grad_norm": 0.9551374912261963, "learning_rate": 1.3721075672295185e-05, "loss": 0.8668, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1098, "tokens_per_second_per_gpu": 17564.21, "total_tokens": 108401232 }, { "epoch": 0.06870467616904226, "grad_norm": 0.9373469352722168, "learning_rate": 1.3733583489681053e-05, "loss": 0.9101, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1099, "tokens_per_second_per_gpu": 18100.51, "total_tokens": 108504089 }, { "epoch": 0.06876719179794949, "grad_norm": 0.961973249912262, "learning_rate": 1.3746091307066918e-05, "loss": 0.8996, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1100, "tokens_per_second_per_gpu": 17912.56, "total_tokens": 108604492 }, { "epoch": 0.06882970742685672, "grad_norm": 1.0313793420791626, "learning_rate": 1.3758599124452785e-05, "loss": 0.8947, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1101, "tokens_per_second_per_gpu": 16796.48, "total_tokens": 108701574 }, { "epoch": 0.06889222305576394, "grad_norm": 1.1441997289657593, "learning_rate": 1.377110694183865e-05, "loss": 0.9214, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1102, "tokens_per_second_per_gpu": 17730.75, "total_tokens": 108802553 }, { "epoch": 0.06895473868467117, "grad_norm": 0.993577241897583, "learning_rate": 1.3783614759224517e-05, "loss": 0.8806, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1103, "tokens_per_second_per_gpu": 17648.42, "total_tokens": 108901730 }, { "epoch": 0.0690172543135784, "grad_norm": 0.9907950758934021, "learning_rate": 1.3796122576610383e-05, "loss": 0.8544, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1104, "tokens_per_second_per_gpu": 16642.23, "total_tokens": 108995637 }, { "epoch": 0.06907976994248562, "grad_norm": 1.0216609239578247, "learning_rate": 1.380863039399625e-05, "loss": 0.8703, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1105, "tokens_per_second_per_gpu": 16755.99, "total_tokens": 109091525 }, { "epoch": 0.06914228557139285, "grad_norm": 0.9949982762336731, "learning_rate": 1.3821138211382115e-05, "loss": 0.9038, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1106, "tokens_per_second_per_gpu": 18020.63, "total_tokens": 109192168 }, { "epoch": 0.06920480120030008, "grad_norm": 1.0360521078109741, "learning_rate": 1.3833646028767982e-05, "loss": 0.8857, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1107, "tokens_per_second_per_gpu": 16817.81, "total_tokens": 109286465 }, { "epoch": 0.0692673168292073, "grad_norm": 1.0268720388412476, "learning_rate": 1.3846153846153847e-05, "loss": 0.8591, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1108, "tokens_per_second_per_gpu": 16405.3, "total_tokens": 109381442 }, { "epoch": 0.06932983245811453, "grad_norm": 0.9838871955871582, "learning_rate": 1.3858661663539714e-05, "loss": 0.8311, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1109, "tokens_per_second_per_gpu": 15235.17, "total_tokens": 109475126 }, { "epoch": 0.06939234808702176, "grad_norm": 0.9745010137557983, "learning_rate": 1.3871169480925579e-05, "loss": 0.8708, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1110, "tokens_per_second_per_gpu": 17399.57, "total_tokens": 109573809 }, { "epoch": 0.06945486371592899, "grad_norm": 0.9853121042251587, "learning_rate": 1.3883677298311446e-05, "loss": 0.8932, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1111, "tokens_per_second_per_gpu": 11746.06, "total_tokens": 109674731 }, { "epoch": 0.0695173793448362, "grad_norm": 0.9769676923751831, "learning_rate": 1.3896185115697311e-05, "loss": 0.926, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1112, "tokens_per_second_per_gpu": 18457.38, "total_tokens": 109776245 }, { "epoch": 0.06957989497374344, "grad_norm": 0.9815648198127747, "learning_rate": 1.390869293308318e-05, "loss": 0.8826, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1113, "tokens_per_second_per_gpu": 17419.36, "total_tokens": 109876695 }, { "epoch": 0.06964241060265067, "grad_norm": 0.9951468110084534, "learning_rate": 1.3921200750469045e-05, "loss": 0.9006, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1114, "tokens_per_second_per_gpu": 17017.71, "total_tokens": 109976348 }, { "epoch": 0.06970492623155788, "grad_norm": 0.9579470157623291, "learning_rate": 1.3933708567854912e-05, "loss": 0.8684, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1115, "tokens_per_second_per_gpu": 16240.48, "total_tokens": 110073840 }, { "epoch": 0.06976744186046512, "grad_norm": 1.0008325576782227, "learning_rate": 1.3946216385240777e-05, "loss": 0.9134, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1116, "tokens_per_second_per_gpu": 17598.28, "total_tokens": 110175267 }, { "epoch": 0.06982995748937235, "grad_norm": 1.010765552520752, "learning_rate": 1.3958724202626644e-05, "loss": 0.8966, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1117, "tokens_per_second_per_gpu": 18829.47, "total_tokens": 110276280 }, { "epoch": 0.06989247311827956, "grad_norm": 0.9589751362800598, "learning_rate": 1.3971232020012509e-05, "loss": 0.8851, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1118, "tokens_per_second_per_gpu": 17215.88, "total_tokens": 110376494 }, { "epoch": 0.0699549887471868, "grad_norm": 0.9895570278167725, "learning_rate": 1.3983739837398376e-05, "loss": 0.8915, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1119, "tokens_per_second_per_gpu": 17722.29, "total_tokens": 110473716 }, { "epoch": 0.07001750437609403, "grad_norm": 0.9770158529281616, "learning_rate": 1.3996247654784241e-05, "loss": 0.9012, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1120, "tokens_per_second_per_gpu": 18057.9, "total_tokens": 110576687 }, { "epoch": 0.07008002000500126, "grad_norm": 0.9669992327690125, "learning_rate": 1.4008755472170108e-05, "loss": 0.8662, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1121, "tokens_per_second_per_gpu": 17927.25, "total_tokens": 110676244 }, { "epoch": 0.07014253563390847, "grad_norm": 0.9767062067985535, "learning_rate": 1.4021263289555973e-05, "loss": 0.8572, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1122, "tokens_per_second_per_gpu": 17539.28, "total_tokens": 110773143 }, { "epoch": 0.0702050512628157, "grad_norm": 0.9713623523712158, "learning_rate": 1.403377110694184e-05, "loss": 0.8542, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1123, "tokens_per_second_per_gpu": 17211.94, "total_tokens": 110872337 }, { "epoch": 0.07026756689172294, "grad_norm": 0.980077862739563, "learning_rate": 1.4046278924327705e-05, "loss": 0.8851, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1124, "tokens_per_second_per_gpu": 17347.23, "total_tokens": 110972510 }, { "epoch": 0.07033008252063015, "grad_norm": 1.0332560539245605, "learning_rate": 1.4058786741713572e-05, "loss": 0.9284, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1125, "tokens_per_second_per_gpu": 18284.03, "total_tokens": 111077396 }, { "epoch": 0.07039259814953738, "grad_norm": 1.0129814147949219, "learning_rate": 1.4071294559099437e-05, "loss": 0.9132, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1126, "tokens_per_second_per_gpu": 16283.73, "total_tokens": 111176610 }, { "epoch": 0.07045511377844461, "grad_norm": 0.9903469085693359, "learning_rate": 1.4083802376485306e-05, "loss": 0.8848, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1127, "tokens_per_second_per_gpu": 17766.53, "total_tokens": 111277277 }, { "epoch": 0.07051762940735183, "grad_norm": 0.9967551827430725, "learning_rate": 1.409631019387117e-05, "loss": 0.8931, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1128, "tokens_per_second_per_gpu": 17690.81, "total_tokens": 111376104 }, { "epoch": 0.07058014503625906, "grad_norm": 0.9949086308479309, "learning_rate": 1.4108818011257038e-05, "loss": 0.8797, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1129, "tokens_per_second_per_gpu": 17816.74, "total_tokens": 111475837 }, { "epoch": 0.07064266066516629, "grad_norm": 0.978926956653595, "learning_rate": 1.4121325828642903e-05, "loss": 0.906, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1130, "tokens_per_second_per_gpu": 17689.22, "total_tokens": 111578625 }, { "epoch": 0.07070517629407352, "grad_norm": 0.9708767533302307, "learning_rate": 1.413383364602877e-05, "loss": 0.8787, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1131, "tokens_per_second_per_gpu": 18462.64, "total_tokens": 111678267 }, { "epoch": 0.07076769192298074, "grad_norm": 1.0793795585632324, "learning_rate": 1.4146341463414635e-05, "loss": 0.9158, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1132, "tokens_per_second_per_gpu": 17018.21, "total_tokens": 111774357 }, { "epoch": 0.07083020755188797, "grad_norm": 0.9719757437705994, "learning_rate": 1.4158849280800502e-05, "loss": 0.9347, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1133, "tokens_per_second_per_gpu": 17277.15, "total_tokens": 111875354 }, { "epoch": 0.0708927231807952, "grad_norm": 1.0787781476974487, "learning_rate": 1.4171357098186367e-05, "loss": 0.8886, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1134, "tokens_per_second_per_gpu": 17012.01, "total_tokens": 111972907 }, { "epoch": 0.07095523880970242, "grad_norm": 1.0367075204849243, "learning_rate": 1.4183864915572234e-05, "loss": 0.9001, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1135, "tokens_per_second_per_gpu": 16545.07, "total_tokens": 112066232 }, { "epoch": 0.07101775443860965, "grad_norm": 0.9805665612220764, "learning_rate": 1.41963727329581e-05, "loss": 0.873, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1136, "tokens_per_second_per_gpu": 17231.49, "total_tokens": 112166915 }, { "epoch": 0.07108027006751688, "grad_norm": 1.015814185142517, "learning_rate": 1.4208880550343966e-05, "loss": 0.9472, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1137, "tokens_per_second_per_gpu": 17308.81, "total_tokens": 112268770 }, { "epoch": 0.07114278569642411, "grad_norm": 1.0004925727844238, "learning_rate": 1.4221388367729831e-05, "loss": 0.8136, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1138, "tokens_per_second_per_gpu": 16241.36, "total_tokens": 112359921 }, { "epoch": 0.07120530132533133, "grad_norm": 0.9808747172355652, "learning_rate": 1.4233896185115698e-05, "loss": 0.8508, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1139, "tokens_per_second_per_gpu": 17643.05, "total_tokens": 112457745 }, { "epoch": 0.07126781695423856, "grad_norm": 0.9936821460723877, "learning_rate": 1.4246404002501563e-05, "loss": 0.8503, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1140, "tokens_per_second_per_gpu": 14893.0, "total_tokens": 112549904 }, { "epoch": 0.07133033258314579, "grad_norm": 0.9799278974533081, "learning_rate": 1.425891181988743e-05, "loss": 0.8898, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1141, "tokens_per_second_per_gpu": 17008.69, "total_tokens": 112648240 }, { "epoch": 0.07139284821205301, "grad_norm": 0.9849611520767212, "learning_rate": 1.4271419637273295e-05, "loss": 0.8799, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1142, "tokens_per_second_per_gpu": 18387.57, "total_tokens": 112747527 }, { "epoch": 0.07145536384096024, "grad_norm": 1.0293608903884888, "learning_rate": 1.4283927454659164e-05, "loss": 0.879, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1143, "tokens_per_second_per_gpu": 16590.05, "total_tokens": 112842167 }, { "epoch": 0.07151787946986747, "grad_norm": 1.0036038160324097, "learning_rate": 1.4296435272045028e-05, "loss": 0.8953, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1144, "tokens_per_second_per_gpu": 17338.02, "total_tokens": 112940565 }, { "epoch": 0.07158039509877469, "grad_norm": 0.9800506234169006, "learning_rate": 1.4308943089430896e-05, "loss": 0.8819, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1145, "tokens_per_second_per_gpu": 17677.45, "total_tokens": 113042917 }, { "epoch": 0.07164291072768192, "grad_norm": 1.0113153457641602, "learning_rate": 1.4321450906816761e-05, "loss": 0.8937, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1146, "tokens_per_second_per_gpu": 17772.98, "total_tokens": 113140717 }, { "epoch": 0.07170542635658915, "grad_norm": 0.9851714968681335, "learning_rate": 1.4333958724202628e-05, "loss": 0.8694, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1147, "tokens_per_second_per_gpu": 16768.2, "total_tokens": 113239169 }, { "epoch": 0.07176794198549638, "grad_norm": 1.0319238901138306, "learning_rate": 1.4346466541588493e-05, "loss": 0.8778, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1148, "tokens_per_second_per_gpu": 17201.53, "total_tokens": 113337011 }, { "epoch": 0.0718304576144036, "grad_norm": 1.0275635719299316, "learning_rate": 1.435897435897436e-05, "loss": 0.8976, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1149, "tokens_per_second_per_gpu": 17064.76, "total_tokens": 113434520 }, { "epoch": 0.07189297324331083, "grad_norm": 0.9586769342422485, "learning_rate": 1.4371482176360225e-05, "loss": 0.8779, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1150, "tokens_per_second_per_gpu": 17129.82, "total_tokens": 113535021 }, { "epoch": 0.07195548887221806, "grad_norm": 1.0193334817886353, "learning_rate": 1.4383989993746092e-05, "loss": 0.9177, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1151, "tokens_per_second_per_gpu": 16861.65, "total_tokens": 113635736 }, { "epoch": 0.07201800450112528, "grad_norm": 0.9816683530807495, "learning_rate": 1.4396497811131958e-05, "loss": 0.8931, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1152, "tokens_per_second_per_gpu": 16981.99, "total_tokens": 113735762 }, { "epoch": 0.07208052013003251, "grad_norm": 0.9364395141601562, "learning_rate": 1.4409005628517824e-05, "loss": 0.8682, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1153, "tokens_per_second_per_gpu": 17351.93, "total_tokens": 113836436 }, { "epoch": 0.07214303575893974, "grad_norm": 0.995496392250061, "learning_rate": 1.4421513445903691e-05, "loss": 0.8938, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1154, "tokens_per_second_per_gpu": 17914.96, "total_tokens": 113937884 }, { "epoch": 0.07220555138784696, "grad_norm": 0.9810004830360413, "learning_rate": 1.4434021263289556e-05, "loss": 0.8642, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1155, "tokens_per_second_per_gpu": 18435.26, "total_tokens": 114036619 }, { "epoch": 0.07226806701675419, "grad_norm": 1.0405703783035278, "learning_rate": 1.4446529080675425e-05, "loss": 0.918, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1156, "tokens_per_second_per_gpu": 17442.0, "total_tokens": 114136527 }, { "epoch": 0.07233058264566142, "grad_norm": 1.0411981344223022, "learning_rate": 1.445903689806129e-05, "loss": 0.8796, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1157, "tokens_per_second_per_gpu": 17381.1, "total_tokens": 114235718 }, { "epoch": 0.07239309827456865, "grad_norm": 0.9782307147979736, "learning_rate": 1.4471544715447157e-05, "loss": 0.8702, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1158, "tokens_per_second_per_gpu": 17184.99, "total_tokens": 114334618 }, { "epoch": 0.07245561390347587, "grad_norm": 1.0188692808151245, "learning_rate": 1.4484052532833022e-05, "loss": 0.9064, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1159, "tokens_per_second_per_gpu": 16928.83, "total_tokens": 114433197 }, { "epoch": 0.0725181295323831, "grad_norm": 1.006922721862793, "learning_rate": 1.449656035021889e-05, "loss": 0.8961, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1160, "tokens_per_second_per_gpu": 17305.8, "total_tokens": 114533032 }, { "epoch": 0.07258064516129033, "grad_norm": 1.184649109840393, "learning_rate": 1.4509068167604754e-05, "loss": 0.9022, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1161, "tokens_per_second_per_gpu": 18044.98, "total_tokens": 114630589 }, { "epoch": 0.07264316079019754, "grad_norm": 0.9709172248840332, "learning_rate": 1.4521575984990621e-05, "loss": 0.8679, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1162, "tokens_per_second_per_gpu": 16718.22, "total_tokens": 114726681 }, { "epoch": 0.07270567641910478, "grad_norm": 0.9615724086761475, "learning_rate": 1.4534083802376486e-05, "loss": 0.8831, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1163, "tokens_per_second_per_gpu": 17068.33, "total_tokens": 114826014 }, { "epoch": 0.072768192048012, "grad_norm": 0.988442599773407, "learning_rate": 1.4546591619762353e-05, "loss": 0.8668, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1164, "tokens_per_second_per_gpu": 17631.91, "total_tokens": 114923440 }, { "epoch": 0.07283070767691922, "grad_norm": 0.9496045708656311, "learning_rate": 1.4559099437148219e-05, "loss": 0.8632, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1165, "tokens_per_second_per_gpu": 16870.83, "total_tokens": 115025124 }, { "epoch": 0.07289322330582645, "grad_norm": 1.0052803754806519, "learning_rate": 1.4571607254534085e-05, "loss": 0.9114, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1166, "tokens_per_second_per_gpu": 18109.64, "total_tokens": 115127166 }, { "epoch": 0.07295573893473369, "grad_norm": 0.9864760637283325, "learning_rate": 1.458411507191995e-05, "loss": 0.8662, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1167, "tokens_per_second_per_gpu": 17709.35, "total_tokens": 115227737 }, { "epoch": 0.07301825456364092, "grad_norm": 0.9867616891860962, "learning_rate": 1.4596622889305817e-05, "loss": 0.9005, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1168, "tokens_per_second_per_gpu": 15454.85, "total_tokens": 115325249 }, { "epoch": 0.07308077019254813, "grad_norm": 1.0618047714233398, "learning_rate": 1.4609130706691683e-05, "loss": 0.9152, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1169, "tokens_per_second_per_gpu": 16967.51, "total_tokens": 115422450 }, { "epoch": 0.07314328582145536, "grad_norm": 1.0082355737686157, "learning_rate": 1.4621638524077551e-05, "loss": 0.8463, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1170, "tokens_per_second_per_gpu": 17381.63, "total_tokens": 115523316 }, { "epoch": 0.0732058014503626, "grad_norm": 0.996658444404602, "learning_rate": 1.4634146341463415e-05, "loss": 0.8526, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1171, "tokens_per_second_per_gpu": 17589.17, "total_tokens": 115623009 }, { "epoch": 0.07326831707926981, "grad_norm": 0.9948294758796692, "learning_rate": 1.4646654158849283e-05, "loss": 0.8614, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1172, "tokens_per_second_per_gpu": 17637.85, "total_tokens": 115720735 }, { "epoch": 0.07333083270817704, "grad_norm": 0.9407763481140137, "learning_rate": 1.4659161976235149e-05, "loss": 0.8699, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1173, "tokens_per_second_per_gpu": 18925.93, "total_tokens": 115821515 }, { "epoch": 0.07339334833708427, "grad_norm": 1.0261191129684448, "learning_rate": 1.4671669793621015e-05, "loss": 0.8569, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1174, "tokens_per_second_per_gpu": 16635.57, "total_tokens": 115918207 }, { "epoch": 0.07345586396599149, "grad_norm": 0.9412176609039307, "learning_rate": 1.468417761100688e-05, "loss": 0.8488, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1175, "tokens_per_second_per_gpu": 17617.35, "total_tokens": 116016903 }, { "epoch": 0.07351837959489872, "grad_norm": 0.955898642539978, "learning_rate": 1.4696685428392747e-05, "loss": 0.853, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1176, "tokens_per_second_per_gpu": 16961.74, "total_tokens": 116116572 }, { "epoch": 0.07358089522380595, "grad_norm": 0.9805734753608704, "learning_rate": 1.4709193245778613e-05, "loss": 0.8779, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1177, "tokens_per_second_per_gpu": 17247.6, "total_tokens": 116217591 }, { "epoch": 0.07364341085271318, "grad_norm": 1.1334725618362427, "learning_rate": 1.472170106316448e-05, "loss": 0.848, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1178, "tokens_per_second_per_gpu": 14670.8, "total_tokens": 116304971 }, { "epoch": 0.0737059264816204, "grad_norm": 1.039786696434021, "learning_rate": 1.4734208880550345e-05, "loss": 0.8589, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1179, "tokens_per_second_per_gpu": 16141.39, "total_tokens": 116400584 }, { "epoch": 0.07376844211052763, "grad_norm": 0.9771906137466431, "learning_rate": 1.4746716697936212e-05, "loss": 0.9443, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1180, "tokens_per_second_per_gpu": 17872.18, "total_tokens": 116503787 }, { "epoch": 0.07383095773943486, "grad_norm": 0.988742470741272, "learning_rate": 1.4759224515322077e-05, "loss": 0.8593, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1181, "tokens_per_second_per_gpu": 18919.61, "total_tokens": 116607834 }, { "epoch": 0.07389347336834208, "grad_norm": 1.0860908031463623, "learning_rate": 1.4771732332707944e-05, "loss": 0.8756, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1182, "tokens_per_second_per_gpu": 17667.16, "total_tokens": 116707727 }, { "epoch": 0.07395598899724931, "grad_norm": 1.0345852375030518, "learning_rate": 1.4784240150093809e-05, "loss": 0.8537, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1183, "tokens_per_second_per_gpu": 18161.57, "total_tokens": 116806537 }, { "epoch": 0.07401850462615654, "grad_norm": 0.9650565385818481, "learning_rate": 1.4796747967479676e-05, "loss": 0.8461, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1184, "tokens_per_second_per_gpu": 17721.88, "total_tokens": 116906193 }, { "epoch": 0.07408102025506376, "grad_norm": 1.032217264175415, "learning_rate": 1.4809255784865541e-05, "loss": 0.9228, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1185, "tokens_per_second_per_gpu": 18351.79, "total_tokens": 117003629 }, { "epoch": 0.07414353588397099, "grad_norm": 0.9960338473320007, "learning_rate": 1.482176360225141e-05, "loss": 0.9206, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1186, "tokens_per_second_per_gpu": 18217.17, "total_tokens": 117106383 }, { "epoch": 0.07420605151287822, "grad_norm": 0.9515200257301331, "learning_rate": 1.4834271419637275e-05, "loss": 0.8476, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1187, "tokens_per_second_per_gpu": 18134.46, "total_tokens": 117206113 }, { "epoch": 0.07426856714178545, "grad_norm": 1.0167559385299683, "learning_rate": 1.4846779237023142e-05, "loss": 0.8442, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1188, "tokens_per_second_per_gpu": 16812.76, "total_tokens": 117300807 }, { "epoch": 0.07433108277069267, "grad_norm": 1.2257806062698364, "learning_rate": 1.4859287054409007e-05, "loss": 0.8717, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1189, "tokens_per_second_per_gpu": 17687.85, "total_tokens": 117398910 }, { "epoch": 0.0743935983995999, "grad_norm": 1.2894424200057983, "learning_rate": 1.4871794871794874e-05, "loss": 0.8402, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1190, "tokens_per_second_per_gpu": 17862.9, "total_tokens": 117500167 }, { "epoch": 0.07445611402850713, "grad_norm": 0.9576022028923035, "learning_rate": 1.4884302689180739e-05, "loss": 0.8894, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1191, "tokens_per_second_per_gpu": 18008.18, "total_tokens": 117601982 }, { "epoch": 0.07451862965741435, "grad_norm": 0.9663230776786804, "learning_rate": 1.4896810506566606e-05, "loss": 0.8875, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1192, "tokens_per_second_per_gpu": 18342.07, "total_tokens": 117701771 }, { "epoch": 0.07458114528632158, "grad_norm": 0.9924978613853455, "learning_rate": 1.4909318323952471e-05, "loss": 0.7877, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1193, "tokens_per_second_per_gpu": 15919.46, "total_tokens": 117794454 }, { "epoch": 0.07464366091522881, "grad_norm": 1.0380538702011108, "learning_rate": 1.4921826141338338e-05, "loss": 0.8788, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1194, "tokens_per_second_per_gpu": 17025.69, "total_tokens": 117892322 }, { "epoch": 0.07470617654413604, "grad_norm": 1.0626617670059204, "learning_rate": 1.4934333958724203e-05, "loss": 0.8365, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1195, "tokens_per_second_per_gpu": 15391.05, "total_tokens": 117985000 }, { "epoch": 0.07476869217304326, "grad_norm": 1.022918701171875, "learning_rate": 1.494684177611007e-05, "loss": 0.8308, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1196, "tokens_per_second_per_gpu": 17842.35, "total_tokens": 118080072 }, { "epoch": 0.07483120780195049, "grad_norm": 1.0172051191329956, "learning_rate": 1.4959349593495935e-05, "loss": 0.8482, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1197, "tokens_per_second_per_gpu": 16393.86, "total_tokens": 118175541 }, { "epoch": 0.07489372343085772, "grad_norm": 1.0250111818313599, "learning_rate": 1.4971857410881802e-05, "loss": 0.8767, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1198, "tokens_per_second_per_gpu": 17119.0, "total_tokens": 118275490 }, { "epoch": 0.07495623905976494, "grad_norm": 1.0021718740463257, "learning_rate": 1.4984365228267667e-05, "loss": 0.9086, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1199, "tokens_per_second_per_gpu": 17516.7, "total_tokens": 118375919 }, { "epoch": 0.07501875468867217, "grad_norm": 1.0148261785507202, "learning_rate": 1.4996873045653536e-05, "loss": 0.9248, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1200, "tokens_per_second_per_gpu": 17014.97, "total_tokens": 118473396 }, { "epoch": 0.0750812703175794, "grad_norm": 0.9869603514671326, "learning_rate": 1.50093808630394e-05, "loss": 0.8628, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1201, "tokens_per_second_per_gpu": 17970.73, "total_tokens": 118574648 }, { "epoch": 0.07514378594648662, "grad_norm": 1.1326217651367188, "learning_rate": 1.5021888680425268e-05, "loss": 0.8955, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1202, "tokens_per_second_per_gpu": 18166.21, "total_tokens": 118678647 }, { "epoch": 0.07520630157539385, "grad_norm": 1.0569709539413452, "learning_rate": 1.5034396497811133e-05, "loss": 0.8484, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1203, "tokens_per_second_per_gpu": 16686.91, "total_tokens": 118774523 }, { "epoch": 0.07526881720430108, "grad_norm": 0.9851219654083252, "learning_rate": 1.5046904315197e-05, "loss": 0.8115, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1204, "tokens_per_second_per_gpu": 18186.7, "total_tokens": 118874806 }, { "epoch": 0.07533133283320831, "grad_norm": 1.016680121421814, "learning_rate": 1.5059412132582865e-05, "loss": 0.8774, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1205, "tokens_per_second_per_gpu": 16898.44, "total_tokens": 118969812 }, { "epoch": 0.07539384846211553, "grad_norm": 1.0263841152191162, "learning_rate": 1.5071919949968732e-05, "loss": 0.8385, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1206, "tokens_per_second_per_gpu": 16299.57, "total_tokens": 119065758 }, { "epoch": 0.07545636409102276, "grad_norm": 1.0414758920669556, "learning_rate": 1.5084427767354597e-05, "loss": 0.8889, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1207, "tokens_per_second_per_gpu": 16195.05, "total_tokens": 119161085 }, { "epoch": 0.07551887971992999, "grad_norm": 0.9923651218414307, "learning_rate": 1.5096935584740464e-05, "loss": 0.8306, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1208, "tokens_per_second_per_gpu": 17261.49, "total_tokens": 119258619 }, { "epoch": 0.0755813953488372, "grad_norm": 1.0089277029037476, "learning_rate": 1.510944340212633e-05, "loss": 0.8569, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1209, "tokens_per_second_per_gpu": 17018.76, "total_tokens": 119354111 }, { "epoch": 0.07564391097774444, "grad_norm": 1.0115537643432617, "learning_rate": 1.5121951219512196e-05, "loss": 0.8914, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1210, "tokens_per_second_per_gpu": 18478.9, "total_tokens": 119452627 }, { "epoch": 0.07570642660665167, "grad_norm": 1.0175286531448364, "learning_rate": 1.5134459036898061e-05, "loss": 0.8788, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1211, "tokens_per_second_per_gpu": 16351.25, "total_tokens": 119551308 }, { "epoch": 0.07576894223555888, "grad_norm": 0.9511014819145203, "learning_rate": 1.5146966854283928e-05, "loss": 0.8243, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1212, "tokens_per_second_per_gpu": 17270.69, "total_tokens": 119647623 }, { "epoch": 0.07583145786446611, "grad_norm": 0.9186429381370544, "learning_rate": 1.5159474671669793e-05, "loss": 0.8726, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1213, "tokens_per_second_per_gpu": 18694.3, "total_tokens": 119752895 }, { "epoch": 0.07589397349337335, "grad_norm": 1.0813924074172974, "learning_rate": 1.517198248905566e-05, "loss": 0.8547, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1214, "tokens_per_second_per_gpu": 16918.81, "total_tokens": 119848816 }, { "epoch": 0.07595648912228058, "grad_norm": 1.0976589918136597, "learning_rate": 1.5184490306441525e-05, "loss": 0.8727, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1215, "tokens_per_second_per_gpu": 16898.31, "total_tokens": 119945132 }, { "epoch": 0.0760190047511878, "grad_norm": 1.1615877151489258, "learning_rate": 1.5196998123827394e-05, "loss": 0.8868, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1216, "tokens_per_second_per_gpu": 17495.58, "total_tokens": 120042916 }, { "epoch": 0.07608152038009502, "grad_norm": 1.0147945880889893, "learning_rate": 1.5209505941213261e-05, "loss": 0.8842, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1217, "tokens_per_second_per_gpu": 17379.98, "total_tokens": 120142693 }, { "epoch": 0.07614403600900226, "grad_norm": 1.0420303344726562, "learning_rate": 1.5222013758599126e-05, "loss": 0.9147, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1218, "tokens_per_second_per_gpu": 17141.12, "total_tokens": 120240271 }, { "epoch": 0.07620655163790947, "grad_norm": 1.082339882850647, "learning_rate": 1.5234521575984993e-05, "loss": 0.8761, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1219, "tokens_per_second_per_gpu": 16477.62, "total_tokens": 120338252 }, { "epoch": 0.0762690672668167, "grad_norm": 1.0398743152618408, "learning_rate": 1.5247029393370858e-05, "loss": 0.8764, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1220, "tokens_per_second_per_gpu": 17731.22, "total_tokens": 120439166 }, { "epoch": 0.07633158289572393, "grad_norm": 1.0085147619247437, "learning_rate": 1.5259537210756725e-05, "loss": 0.8255, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1221, "tokens_per_second_per_gpu": 17926.37, "total_tokens": 120539730 }, { "epoch": 0.07639409852463115, "grad_norm": 1.037609338760376, "learning_rate": 1.527204502814259e-05, "loss": 0.8131, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1222, "tokens_per_second_per_gpu": 16283.62, "total_tokens": 120633746 }, { "epoch": 0.07645661415353838, "grad_norm": 0.9819731116294861, "learning_rate": 1.528455284552846e-05, "loss": 0.8944, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1223, "tokens_per_second_per_gpu": 17220.14, "total_tokens": 120735213 }, { "epoch": 0.07651912978244561, "grad_norm": 0.9880427718162537, "learning_rate": 1.5297060662914324e-05, "loss": 0.8916, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1224, "tokens_per_second_per_gpu": 17559.98, "total_tokens": 120835785 }, { "epoch": 0.07658164541135284, "grad_norm": 1.0636508464813232, "learning_rate": 1.530956848030019e-05, "loss": 0.8802, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1225, "tokens_per_second_per_gpu": 17835.54, "total_tokens": 120934023 }, { "epoch": 0.07664416104026006, "grad_norm": 0.974310040473938, "learning_rate": 1.5322076297686054e-05, "loss": 0.8937, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1226, "tokens_per_second_per_gpu": 18136.02, "total_tokens": 121038450 }, { "epoch": 0.07670667666916729, "grad_norm": 1.0437086820602417, "learning_rate": 1.5334584115071923e-05, "loss": 0.9206, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1227, "tokens_per_second_per_gpu": 18933.7, "total_tokens": 121142398 }, { "epoch": 0.07676919229807452, "grad_norm": 0.9709698557853699, "learning_rate": 1.5347091932457788e-05, "loss": 0.8837, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1228, "tokens_per_second_per_gpu": 17705.69, "total_tokens": 121243355 }, { "epoch": 0.07683170792698174, "grad_norm": 0.9951902031898499, "learning_rate": 1.5359599749843653e-05, "loss": 0.8998, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1229, "tokens_per_second_per_gpu": 16739.38, "total_tokens": 121340953 }, { "epoch": 0.07689422355588897, "grad_norm": 0.9655812382698059, "learning_rate": 1.537210756722952e-05, "loss": 0.8609, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1230, "tokens_per_second_per_gpu": 17279.61, "total_tokens": 121440283 }, { "epoch": 0.0769567391847962, "grad_norm": 0.9630962610244751, "learning_rate": 1.5384615384615387e-05, "loss": 0.8183, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1231, "tokens_per_second_per_gpu": 17145.24, "total_tokens": 121537185 }, { "epoch": 0.07701925481370342, "grad_norm": 0.981209397315979, "learning_rate": 1.5397123202001252e-05, "loss": 0.8214, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1232, "tokens_per_second_per_gpu": 17154.54, "total_tokens": 121634852 }, { "epoch": 0.07708177044261065, "grad_norm": 0.9438909888267517, "learning_rate": 1.5409631019387118e-05, "loss": 0.8393, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1233, "tokens_per_second_per_gpu": 16433.8, "total_tokens": 121733025 }, { "epoch": 0.07714428607151788, "grad_norm": 0.9878749847412109, "learning_rate": 1.5422138836772983e-05, "loss": 0.8633, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1234, "tokens_per_second_per_gpu": 17452.83, "total_tokens": 121833258 }, { "epoch": 0.07720680170042511, "grad_norm": 1.1034709215164185, "learning_rate": 1.543464665415885e-05, "loss": 0.8756, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1235, "tokens_per_second_per_gpu": 17272.72, "total_tokens": 121929940 }, { "epoch": 0.07726931732933233, "grad_norm": 1.0424891710281372, "learning_rate": 1.5447154471544717e-05, "loss": 0.8627, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1236, "tokens_per_second_per_gpu": 16889.6, "total_tokens": 122026508 }, { "epoch": 0.07733183295823956, "grad_norm": 0.9523679614067078, "learning_rate": 1.5459662288930585e-05, "loss": 0.8111, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1237, "tokens_per_second_per_gpu": 17939.24, "total_tokens": 122126838 }, { "epoch": 0.07739434858714679, "grad_norm": 0.9850111603736877, "learning_rate": 1.547217010631645e-05, "loss": 0.8373, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1238, "tokens_per_second_per_gpu": 17970.67, "total_tokens": 122226344 }, { "epoch": 0.07745686421605401, "grad_norm": 1.0136868953704834, "learning_rate": 1.5484677923702315e-05, "loss": 0.8553, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1239, "tokens_per_second_per_gpu": 17024.01, "total_tokens": 122322310 }, { "epoch": 0.07751937984496124, "grad_norm": 1.0102453231811523, "learning_rate": 1.549718574108818e-05, "loss": 0.918, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1240, "tokens_per_second_per_gpu": 16831.84, "total_tokens": 122422399 }, { "epoch": 0.07758189547386847, "grad_norm": 0.9900463223457336, "learning_rate": 1.550969355847405e-05, "loss": 0.8244, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1241, "tokens_per_second_per_gpu": 17469.17, "total_tokens": 122522825 }, { "epoch": 0.07764441110277569, "grad_norm": 1.0047115087509155, "learning_rate": 1.5522201375859914e-05, "loss": 0.8409, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1242, "tokens_per_second_per_gpu": 18186.4, "total_tokens": 122623386 }, { "epoch": 0.07770692673168292, "grad_norm": 1.0103157758712769, "learning_rate": 1.553470919324578e-05, "loss": 0.8577, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1243, "tokens_per_second_per_gpu": 17047.66, "total_tokens": 122722263 }, { "epoch": 0.07776944236059015, "grad_norm": 0.999445915222168, "learning_rate": 1.5547217010631645e-05, "loss": 0.8735, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1244, "tokens_per_second_per_gpu": 17970.97, "total_tokens": 122820847 }, { "epoch": 0.07783195798949738, "grad_norm": 1.0072752237319946, "learning_rate": 1.5559724828017513e-05, "loss": 0.8933, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1245, "tokens_per_second_per_gpu": 17593.68, "total_tokens": 122921926 }, { "epoch": 0.0778944736184046, "grad_norm": 1.0235307216644287, "learning_rate": 1.557223264540338e-05, "loss": 0.9164, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1246, "tokens_per_second_per_gpu": 18059.53, "total_tokens": 123024140 }, { "epoch": 0.07795698924731183, "grad_norm": 0.9547250270843506, "learning_rate": 1.5584740462789244e-05, "loss": 0.8635, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1247, "tokens_per_second_per_gpu": 17378.66, "total_tokens": 123123926 }, { "epoch": 0.07801950487621906, "grad_norm": 0.9958335757255554, "learning_rate": 1.559724828017511e-05, "loss": 0.7881, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1248, "tokens_per_second_per_gpu": 14820.27, "total_tokens": 123215778 }, { "epoch": 0.07808202050512628, "grad_norm": 1.0034396648406982, "learning_rate": 1.5609756097560978e-05, "loss": 0.8632, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1249, "tokens_per_second_per_gpu": 16997.07, "total_tokens": 123315931 }, { "epoch": 0.07814453613403351, "grad_norm": 0.9862202405929565, "learning_rate": 1.5622263914946843e-05, "loss": 0.8948, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1250, "tokens_per_second_per_gpu": 17550.66, "total_tokens": 123415332 }, { "epoch": 0.07820705176294074, "grad_norm": 1.0602853298187256, "learning_rate": 1.563477173233271e-05, "loss": 0.8371, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1251, "tokens_per_second_per_gpu": 16454.81, "total_tokens": 123506526 }, { "epoch": 0.07826956739184796, "grad_norm": 0.9792863130569458, "learning_rate": 1.5647279549718573e-05, "loss": 0.8385, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1252, "tokens_per_second_per_gpu": 17204.68, "total_tokens": 123603801 }, { "epoch": 0.07833208302075519, "grad_norm": 0.9803183674812317, "learning_rate": 1.5659787367104442e-05, "loss": 0.8408, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1253, "tokens_per_second_per_gpu": 18655.94, "total_tokens": 123705257 }, { "epoch": 0.07839459864966242, "grad_norm": 0.9752062559127808, "learning_rate": 1.5672295184490307e-05, "loss": 0.8254, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1254, "tokens_per_second_per_gpu": 16523.6, "total_tokens": 123800130 }, { "epoch": 0.07845711427856965, "grad_norm": 1.036531686782837, "learning_rate": 1.5684803001876175e-05, "loss": 0.8888, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1255, "tokens_per_second_per_gpu": 18370.72, "total_tokens": 123904646 }, { "epoch": 0.07851962990747686, "grad_norm": 1.0595303773880005, "learning_rate": 1.569731081926204e-05, "loss": 0.8726, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1256, "tokens_per_second_per_gpu": 15944.12, "total_tokens": 124001727 }, { "epoch": 0.0785821455363841, "grad_norm": 1.039813756942749, "learning_rate": 1.5709818636647906e-05, "loss": 0.9258, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1257, "tokens_per_second_per_gpu": 18795.99, "total_tokens": 124105490 }, { "epoch": 0.07864466116529133, "grad_norm": 0.9522657990455627, "learning_rate": 1.572232645403377e-05, "loss": 0.8296, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1258, "tokens_per_second_per_gpu": 17888.71, "total_tokens": 124206088 }, { "epoch": 0.07870717679419854, "grad_norm": 0.9950201511383057, "learning_rate": 1.573483427141964e-05, "loss": 0.8567, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1259, "tokens_per_second_per_gpu": 16090.04, "total_tokens": 124303304 }, { "epoch": 0.07876969242310577, "grad_norm": 1.001624584197998, "learning_rate": 1.5747342088805505e-05, "loss": 0.8836, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1260, "tokens_per_second_per_gpu": 18297.55, "total_tokens": 124402674 }, { "epoch": 0.078832208052013, "grad_norm": 0.955510139465332, "learning_rate": 1.575984990619137e-05, "loss": 0.8398, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1261, "tokens_per_second_per_gpu": 17935.81, "total_tokens": 124505077 }, { "epoch": 0.07889472368092024, "grad_norm": 0.9451510906219482, "learning_rate": 1.5772357723577235e-05, "loss": 0.8453, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1262, "tokens_per_second_per_gpu": 17979.39, "total_tokens": 124607473 }, { "epoch": 0.07895723930982745, "grad_norm": 0.9835764169692993, "learning_rate": 1.5784865540963104e-05, "loss": 0.918, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1263, "tokens_per_second_per_gpu": 17739.34, "total_tokens": 124707439 }, { "epoch": 0.07901975493873468, "grad_norm": 1.01762056350708, "learning_rate": 1.579737335834897e-05, "loss": 0.8817, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1264, "tokens_per_second_per_gpu": 17462.62, "total_tokens": 124808837 }, { "epoch": 0.07908227056764192, "grad_norm": 1.0017390251159668, "learning_rate": 1.5809881175734834e-05, "loss": 0.8404, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1265, "tokens_per_second_per_gpu": 15893.48, "total_tokens": 124904062 }, { "epoch": 0.07914478619654913, "grad_norm": 0.9910359978675842, "learning_rate": 1.58223889931207e-05, "loss": 0.8792, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1266, "tokens_per_second_per_gpu": 17522.58, "total_tokens": 125002665 }, { "epoch": 0.07920730182545636, "grad_norm": 0.9801254868507385, "learning_rate": 1.5834896810506568e-05, "loss": 0.8377, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1267, "tokens_per_second_per_gpu": 17464.54, "total_tokens": 125097249 }, { "epoch": 0.0792698174543636, "grad_norm": 0.9758040904998779, "learning_rate": 1.5847404627892433e-05, "loss": 0.8371, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1268, "tokens_per_second_per_gpu": 18188.97, "total_tokens": 125197922 }, { "epoch": 0.07933233308327081, "grad_norm": 1.0314689874649048, "learning_rate": 1.58599124452783e-05, "loss": 0.8729, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1269, "tokens_per_second_per_gpu": 16448.37, "total_tokens": 125296187 }, { "epoch": 0.07939484871217804, "grad_norm": 1.0363858938217163, "learning_rate": 1.5872420262664167e-05, "loss": 0.8509, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1270, "tokens_per_second_per_gpu": 17234.98, "total_tokens": 125390603 }, { "epoch": 0.07945736434108527, "grad_norm": 0.9765426516532898, "learning_rate": 1.5884928080050032e-05, "loss": 0.838, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1271, "tokens_per_second_per_gpu": 16833.74, "total_tokens": 125489656 }, { "epoch": 0.0795198799699925, "grad_norm": 0.9764242768287659, "learning_rate": 1.5897435897435897e-05, "loss": 0.8672, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1272, "tokens_per_second_per_gpu": 16278.22, "total_tokens": 125584954 }, { "epoch": 0.07958239559889972, "grad_norm": 0.9346179366111755, "learning_rate": 1.5909943714821766e-05, "loss": 0.85, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1273, "tokens_per_second_per_gpu": 17790.97, "total_tokens": 125686729 }, { "epoch": 0.07964491122780695, "grad_norm": 0.9730413556098938, "learning_rate": 1.592245153220763e-05, "loss": 0.8356, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1274, "tokens_per_second_per_gpu": 17907.05, "total_tokens": 125785695 }, { "epoch": 0.07970742685671418, "grad_norm": 0.9653152823448181, "learning_rate": 1.5934959349593496e-05, "loss": 0.8815, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1275, "tokens_per_second_per_gpu": 17436.5, "total_tokens": 125886314 }, { "epoch": 0.0797699424856214, "grad_norm": 0.9776859879493713, "learning_rate": 1.594746716697936e-05, "loss": 0.8398, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1276, "tokens_per_second_per_gpu": 18071.42, "total_tokens": 125987005 }, { "epoch": 0.07983245811452863, "grad_norm": 0.9746566414833069, "learning_rate": 1.595997498436523e-05, "loss": 0.839, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1277, "tokens_per_second_per_gpu": 17771.51, "total_tokens": 126086678 }, { "epoch": 0.07989497374343586, "grad_norm": 0.973211944103241, "learning_rate": 1.5972482801751095e-05, "loss": 0.8666, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1278, "tokens_per_second_per_gpu": 17873.58, "total_tokens": 126185074 }, { "epoch": 0.07995748937234308, "grad_norm": 1.0062364339828491, "learning_rate": 1.598499061913696e-05, "loss": 0.8792, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1279, "tokens_per_second_per_gpu": 16146.08, "total_tokens": 126280086 }, { "epoch": 0.08002000500125031, "grad_norm": 0.9764326214790344, "learning_rate": 1.5997498436522826e-05, "loss": 0.8333, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1280, "tokens_per_second_per_gpu": 17803.55, "total_tokens": 126377475 }, { "epoch": 0.08008252063015754, "grad_norm": 0.970731258392334, "learning_rate": 1.6010006253908694e-05, "loss": 0.8819, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1281, "tokens_per_second_per_gpu": 19000.49, "total_tokens": 126482401 }, { "epoch": 0.08014503625906477, "grad_norm": 1.0003836154937744, "learning_rate": 1.6022514071294563e-05, "loss": 0.8825, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1282, "tokens_per_second_per_gpu": 17998.95, "total_tokens": 126583260 }, { "epoch": 0.08020755188797199, "grad_norm": 0.9609677195549011, "learning_rate": 1.6035021888680428e-05, "loss": 0.8641, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1283, "tokens_per_second_per_gpu": 16894.64, "total_tokens": 126684575 }, { "epoch": 0.08027006751687922, "grad_norm": 1.0187934637069702, "learning_rate": 1.6047529706066293e-05, "loss": 0.8606, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1284, "tokens_per_second_per_gpu": 18356.36, "total_tokens": 126783978 }, { "epoch": 0.08033258314578645, "grad_norm": 0.9678820371627808, "learning_rate": 1.6060037523452158e-05, "loss": 0.8646, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1285, "tokens_per_second_per_gpu": 16785.75, "total_tokens": 126883767 }, { "epoch": 0.08039509877469367, "grad_norm": 0.9691622257232666, "learning_rate": 1.6072545340838027e-05, "loss": 0.8644, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1286, "tokens_per_second_per_gpu": 17864.34, "total_tokens": 126984631 }, { "epoch": 0.0804576144036009, "grad_norm": 0.9304622411727905, "learning_rate": 1.6085053158223892e-05, "loss": 0.8204, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1287, "tokens_per_second_per_gpu": 16277.13, "total_tokens": 127080425 }, { "epoch": 0.08052013003250813, "grad_norm": 0.9470855593681335, "learning_rate": 1.6097560975609757e-05, "loss": 0.8669, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1288, "tokens_per_second_per_gpu": 17239.67, "total_tokens": 127181591 }, { "epoch": 0.08058264566141535, "grad_norm": 0.9962979555130005, "learning_rate": 1.6110068792995622e-05, "loss": 0.8455, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1289, "tokens_per_second_per_gpu": 17575.82, "total_tokens": 127279367 }, { "epoch": 0.08064516129032258, "grad_norm": 0.9978755712509155, "learning_rate": 1.612257661038149e-05, "loss": 0.8411, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1290, "tokens_per_second_per_gpu": 17097.14, "total_tokens": 127377502 }, { "epoch": 0.08070767691922981, "grad_norm": 1.024781346321106, "learning_rate": 1.6135084427767356e-05, "loss": 0.912, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1291, "tokens_per_second_per_gpu": 17932.83, "total_tokens": 127481630 }, { "epoch": 0.08077019254813704, "grad_norm": 1.0108319520950317, "learning_rate": 1.614759224515322e-05, "loss": 0.85, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1292, "tokens_per_second_per_gpu": 17199.31, "total_tokens": 127576341 }, { "epoch": 0.08083270817704426, "grad_norm": 0.9994286298751831, "learning_rate": 1.6160100062539087e-05, "loss": 0.8303, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1293, "tokens_per_second_per_gpu": 16486.58, "total_tokens": 127674464 }, { "epoch": 0.08089522380595149, "grad_norm": 0.9800280928611755, "learning_rate": 1.6172607879924955e-05, "loss": 0.8358, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1294, "tokens_per_second_per_gpu": 17162.95, "total_tokens": 127772009 }, { "epoch": 0.08095773943485872, "grad_norm": 0.9482160210609436, "learning_rate": 1.618511569731082e-05, "loss": 0.8309, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1295, "tokens_per_second_per_gpu": 18129.2, "total_tokens": 127871082 }, { "epoch": 0.08102025506376594, "grad_norm": 0.9685713648796082, "learning_rate": 1.619762351469669e-05, "loss": 0.8421, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1296, "tokens_per_second_per_gpu": 17429.3, "total_tokens": 127970224 }, { "epoch": 0.08108277069267317, "grad_norm": 1.0107240676879883, "learning_rate": 1.6210131332082554e-05, "loss": 0.8615, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1297, "tokens_per_second_per_gpu": 18185.07, "total_tokens": 128071683 }, { "epoch": 0.0811452863215804, "grad_norm": 0.9556005001068115, "learning_rate": 1.622263914946842e-05, "loss": 0.8671, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1298, "tokens_per_second_per_gpu": 18527.71, "total_tokens": 128174984 }, { "epoch": 0.08120780195048762, "grad_norm": 0.9779369235038757, "learning_rate": 1.6235146966854285e-05, "loss": 0.8436, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1299, "tokens_per_second_per_gpu": 16924.2, "total_tokens": 128273507 }, { "epoch": 0.08127031757939485, "grad_norm": 1.0096960067749023, "learning_rate": 1.6247654784240153e-05, "loss": 0.861, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1300, "tokens_per_second_per_gpu": 17867.46, "total_tokens": 128372795 }, { "epoch": 0.08133283320830208, "grad_norm": 1.011788010597229, "learning_rate": 1.6260162601626018e-05, "loss": 0.8134, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1301, "tokens_per_second_per_gpu": 16844.16, "total_tokens": 128475340 }, { "epoch": 0.08139534883720931, "grad_norm": 1.0977610349655151, "learning_rate": 1.6272670419011883e-05, "loss": 0.9168, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1302, "tokens_per_second_per_gpu": 17950.15, "total_tokens": 128576980 }, { "epoch": 0.08145786446611653, "grad_norm": 0.9740389585494995, "learning_rate": 1.628517823639775e-05, "loss": 0.8774, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1303, "tokens_per_second_per_gpu": 18055.56, "total_tokens": 128679242 }, { "epoch": 0.08152038009502376, "grad_norm": 1.0211927890777588, "learning_rate": 1.6297686053783617e-05, "loss": 0.8756, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1304, "tokens_per_second_per_gpu": 17355.2, "total_tokens": 128777759 }, { "epoch": 0.08158289572393099, "grad_norm": 1.0229408740997314, "learning_rate": 1.6310193871169482e-05, "loss": 0.8268, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1305, "tokens_per_second_per_gpu": 17992.54, "total_tokens": 128881119 }, { "epoch": 0.0816454113528382, "grad_norm": 1.0573906898498535, "learning_rate": 1.6322701688555348e-05, "loss": 0.8507, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1306, "tokens_per_second_per_gpu": 17800.21, "total_tokens": 128977750 }, { "epoch": 0.08170792698174544, "grad_norm": 0.982500433921814, "learning_rate": 1.6335209505941213e-05, "loss": 0.8957, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1307, "tokens_per_second_per_gpu": 18449.65, "total_tokens": 129081965 }, { "epoch": 0.08177044261065267, "grad_norm": 0.922448456287384, "learning_rate": 1.634771732332708e-05, "loss": 0.8304, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1308, "tokens_per_second_per_gpu": 17616.5, "total_tokens": 129181903 }, { "epoch": 0.08183295823955988, "grad_norm": 1.0505974292755127, "learning_rate": 1.6360225140712947e-05, "loss": 0.8764, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1309, "tokens_per_second_per_gpu": 17099.46, "total_tokens": 129279048 }, { "epoch": 0.08189547386846711, "grad_norm": 1.151983380317688, "learning_rate": 1.6372732958098815e-05, "loss": 0.8337, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1310, "tokens_per_second_per_gpu": 16493.3, "total_tokens": 129370767 }, { "epoch": 0.08195798949737435, "grad_norm": 1.018183708190918, "learning_rate": 1.638524077548468e-05, "loss": 0.8159, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1311, "tokens_per_second_per_gpu": 16889.84, "total_tokens": 129469168 }, { "epoch": 0.08202050512628158, "grad_norm": 0.9824730157852173, "learning_rate": 1.6397748592870546e-05, "loss": 0.8502, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1312, "tokens_per_second_per_gpu": 16808.01, "total_tokens": 129568977 }, { "epoch": 0.08208302075518879, "grad_norm": 1.0023713111877441, "learning_rate": 1.641025641025641e-05, "loss": 0.7942, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1313, "tokens_per_second_per_gpu": 15605.33, "total_tokens": 129662089 }, { "epoch": 0.08214553638409602, "grad_norm": 1.0267705917358398, "learning_rate": 1.642276422764228e-05, "loss": 0.8705, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1314, "tokens_per_second_per_gpu": 17230.47, "total_tokens": 129760891 }, { "epoch": 0.08220805201300325, "grad_norm": 0.9709930419921875, "learning_rate": 1.6435272045028144e-05, "loss": 0.8319, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1315, "tokens_per_second_per_gpu": 16731.72, "total_tokens": 129857211 }, { "epoch": 0.08227056764191047, "grad_norm": 0.9959014654159546, "learning_rate": 1.644777986241401e-05, "loss": 0.9073, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1316, "tokens_per_second_per_gpu": 17730.04, "total_tokens": 129956220 }, { "epoch": 0.0823330832708177, "grad_norm": 0.9723799824714661, "learning_rate": 1.6460287679799875e-05, "loss": 0.823, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1317, "tokens_per_second_per_gpu": 16438.69, "total_tokens": 130053157 }, { "epoch": 0.08239559889972493, "grad_norm": 0.9898772239685059, "learning_rate": 1.6472795497185743e-05, "loss": 0.8438, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1318, "tokens_per_second_per_gpu": 17728.34, "total_tokens": 130154247 }, { "epoch": 0.08245811452863216, "grad_norm": 0.9866734743118286, "learning_rate": 1.648530331457161e-05, "loss": 0.836, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1319, "tokens_per_second_per_gpu": 16910.05, "total_tokens": 130254609 }, { "epoch": 0.08252063015753938, "grad_norm": 1.0018854141235352, "learning_rate": 1.6497811131957474e-05, "loss": 0.8725, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1320, "tokens_per_second_per_gpu": 16874.67, "total_tokens": 130352288 }, { "epoch": 0.08258314578644661, "grad_norm": 1.0204178094863892, "learning_rate": 1.651031894934334e-05, "loss": 0.8914, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1321, "tokens_per_second_per_gpu": 18245.88, "total_tokens": 130452521 }, { "epoch": 0.08264566141535384, "grad_norm": 0.985517144203186, "learning_rate": 1.6522826766729208e-05, "loss": 0.8485, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1322, "tokens_per_second_per_gpu": 17916.33, "total_tokens": 130553830 }, { "epoch": 0.08270817704426106, "grad_norm": 0.9727475643157959, "learning_rate": 1.6535334584115073e-05, "loss": 0.854, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1323, "tokens_per_second_per_gpu": 17428.42, "total_tokens": 130650278 }, { "epoch": 0.08277069267316829, "grad_norm": 0.922747790813446, "learning_rate": 1.654784240150094e-05, "loss": 0.8287, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1324, "tokens_per_second_per_gpu": 16614.41, "total_tokens": 130747527 }, { "epoch": 0.08283320830207552, "grad_norm": 0.9563944935798645, "learning_rate": 1.6560350218886803e-05, "loss": 0.8699, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1325, "tokens_per_second_per_gpu": 16705.62, "total_tokens": 130845156 }, { "epoch": 0.08289572393098274, "grad_norm": 0.9686769843101501, "learning_rate": 1.6572858036272672e-05, "loss": 0.8548, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1326, "tokens_per_second_per_gpu": 18420.12, "total_tokens": 130946296 }, { "epoch": 0.08295823955988997, "grad_norm": 0.9752834439277649, "learning_rate": 1.6585365853658537e-05, "loss": 0.878, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1327, "tokens_per_second_per_gpu": 18061.28, "total_tokens": 131048867 }, { "epoch": 0.0830207551887972, "grad_norm": 0.9774545431137085, "learning_rate": 1.6597873671044406e-05, "loss": 0.8604, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1328, "tokens_per_second_per_gpu": 17697.53, "total_tokens": 131150798 }, { "epoch": 0.08308327081770443, "grad_norm": 0.9547738432884216, "learning_rate": 1.661038148843027e-05, "loss": 0.8469, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1329, "tokens_per_second_per_gpu": 17954.79, "total_tokens": 131249504 }, { "epoch": 0.08314578644661165, "grad_norm": 1.0233150720596313, "learning_rate": 1.6622889305816136e-05, "loss": 0.8418, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1330, "tokens_per_second_per_gpu": 15825.34, "total_tokens": 131343690 }, { "epoch": 0.08320830207551888, "grad_norm": 0.981158435344696, "learning_rate": 1.6635397123202e-05, "loss": 0.8354, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1331, "tokens_per_second_per_gpu": 18410.9, "total_tokens": 131445753 }, { "epoch": 0.08327081770442611, "grad_norm": 1.0140544176101685, "learning_rate": 1.664790494058787e-05, "loss": 0.8335, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1332, "tokens_per_second_per_gpu": 17061.65, "total_tokens": 131545437 }, { "epoch": 0.08333333333333333, "grad_norm": 0.955407440662384, "learning_rate": 1.6660412757973735e-05, "loss": 0.8228, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1333, "tokens_per_second_per_gpu": 17801.34, "total_tokens": 131645002 }, { "epoch": 0.08339584896224056, "grad_norm": 1.0326261520385742, "learning_rate": 1.66729205753596e-05, "loss": 0.8389, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1334, "tokens_per_second_per_gpu": 18064.17, "total_tokens": 131745257 }, { "epoch": 0.08345836459114779, "grad_norm": 1.0405043363571167, "learning_rate": 1.6685428392745465e-05, "loss": 0.8325, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1335, "tokens_per_second_per_gpu": 17511.23, "total_tokens": 131842169 }, { "epoch": 0.08352088022005501, "grad_norm": 1.0354281663894653, "learning_rate": 1.6697936210131334e-05, "loss": 0.8204, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1336, "tokens_per_second_per_gpu": 15898.76, "total_tokens": 131937973 }, { "epoch": 0.08358339584896224, "grad_norm": 0.9809563159942627, "learning_rate": 1.67104440275172e-05, "loss": 0.8348, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1337, "tokens_per_second_per_gpu": 16749.65, "total_tokens": 132036609 }, { "epoch": 0.08364591147786947, "grad_norm": 0.9783992767333984, "learning_rate": 1.6722951844903064e-05, "loss": 0.846, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1338, "tokens_per_second_per_gpu": 17493.31, "total_tokens": 132135046 }, { "epoch": 0.0837084271067767, "grad_norm": 1.072164535522461, "learning_rate": 1.673545966228893e-05, "loss": 0.8482, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1339, "tokens_per_second_per_gpu": 17389.16, "total_tokens": 132233109 }, { "epoch": 0.08377094273568392, "grad_norm": 0.9575210809707642, "learning_rate": 1.6747967479674798e-05, "loss": 0.8376, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1340, "tokens_per_second_per_gpu": 17336.62, "total_tokens": 132331532 }, { "epoch": 0.08383345836459115, "grad_norm": 1.0059796571731567, "learning_rate": 1.6760475297060663e-05, "loss": 0.8516, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1341, "tokens_per_second_per_gpu": 17786.71, "total_tokens": 132434680 }, { "epoch": 0.08389597399349838, "grad_norm": 1.0651289224624634, "learning_rate": 1.6772983114446532e-05, "loss": 0.8394, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1342, "tokens_per_second_per_gpu": 17849.81, "total_tokens": 132533645 }, { "epoch": 0.0839584896224056, "grad_norm": 1.009333848953247, "learning_rate": 1.6785490931832397e-05, "loss": 0.8441, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1343, "tokens_per_second_per_gpu": 18083.11, "total_tokens": 132631271 }, { "epoch": 0.08402100525131283, "grad_norm": 0.9900709986686707, "learning_rate": 1.6797998749218262e-05, "loss": 0.8838, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1344, "tokens_per_second_per_gpu": 17667.57, "total_tokens": 132729087 }, { "epoch": 0.08408352088022006, "grad_norm": 0.9863498210906982, "learning_rate": 1.681050656660413e-05, "loss": 0.8233, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1345, "tokens_per_second_per_gpu": 17645.48, "total_tokens": 132828213 }, { "epoch": 0.08414603650912728, "grad_norm": 1.002219796180725, "learning_rate": 1.6823014383989996e-05, "loss": 0.8577, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1346, "tokens_per_second_per_gpu": 17117.17, "total_tokens": 132926495 }, { "epoch": 0.0842085521380345, "grad_norm": 0.9891634583473206, "learning_rate": 1.683552220137586e-05, "loss": 0.868, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1347, "tokens_per_second_per_gpu": 18354.07, "total_tokens": 133025849 }, { "epoch": 0.08427106776694174, "grad_norm": 0.9848018884658813, "learning_rate": 1.6848030018761726e-05, "loss": 0.8189, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1348, "tokens_per_second_per_gpu": 17413.97, "total_tokens": 133121963 }, { "epoch": 0.08433358339584897, "grad_norm": 1.0086358785629272, "learning_rate": 1.6860537836147595e-05, "loss": 0.8503, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1349, "tokens_per_second_per_gpu": 17123.34, "total_tokens": 133221042 }, { "epoch": 0.08439609902475619, "grad_norm": 1.0416098833084106, "learning_rate": 1.687304565353346e-05, "loss": 0.867, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1350, "tokens_per_second_per_gpu": 16649.19, "total_tokens": 133314355 }, { "epoch": 0.08445861465366342, "grad_norm": 1.020100712776184, "learning_rate": 1.688555347091933e-05, "loss": 0.8623, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1351, "tokens_per_second_per_gpu": 17495.28, "total_tokens": 133414054 }, { "epoch": 0.08452113028257065, "grad_norm": 0.9689168930053711, "learning_rate": 1.689806128830519e-05, "loss": 0.8307, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1352, "tokens_per_second_per_gpu": 16952.25, "total_tokens": 133514192 }, { "epoch": 0.08458364591147786, "grad_norm": 1.002724051475525, "learning_rate": 1.691056910569106e-05, "loss": 0.8286, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1353, "tokens_per_second_per_gpu": 16430.63, "total_tokens": 133609944 }, { "epoch": 0.0846461615403851, "grad_norm": 1.0075627565383911, "learning_rate": 1.6923076923076924e-05, "loss": 0.8776, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1354, "tokens_per_second_per_gpu": 18295.08, "total_tokens": 133710302 }, { "epoch": 0.08470867716929233, "grad_norm": 1.1243411302566528, "learning_rate": 1.6935584740462793e-05, "loss": 0.8252, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1355, "tokens_per_second_per_gpu": 16036.37, "total_tokens": 133805395 }, { "epoch": 0.08477119279819954, "grad_norm": 0.9866685271263123, "learning_rate": 1.6948092557848658e-05, "loss": 0.8232, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1356, "tokens_per_second_per_gpu": 18514.49, "total_tokens": 133906557 }, { "epoch": 0.08483370842710677, "grad_norm": 0.9493312835693359, "learning_rate": 1.6960600375234523e-05, "loss": 0.8676, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1357, "tokens_per_second_per_gpu": 18029.85, "total_tokens": 134008264 }, { "epoch": 0.084896224056014, "grad_norm": 0.9625911116600037, "learning_rate": 1.697310819262039e-05, "loss": 0.8342, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1358, "tokens_per_second_per_gpu": 18071.58, "total_tokens": 134108522 }, { "epoch": 0.08495873968492124, "grad_norm": 0.965522825717926, "learning_rate": 1.6985616010006257e-05, "loss": 0.8339, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1359, "tokens_per_second_per_gpu": 16548.94, "total_tokens": 134206240 }, { "epoch": 0.08502125531382845, "grad_norm": 1.02810800075531, "learning_rate": 1.6998123827392122e-05, "loss": 0.8717, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1360, "tokens_per_second_per_gpu": 16967.08, "total_tokens": 134300926 }, { "epoch": 0.08508377094273568, "grad_norm": 0.9538962244987488, "learning_rate": 1.7010631644777987e-05, "loss": 0.8147, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1361, "tokens_per_second_per_gpu": 16942.21, "total_tokens": 134396573 }, { "epoch": 0.08514628657164292, "grad_norm": 0.9521826505661011, "learning_rate": 1.7023139462163853e-05, "loss": 0.8405, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1362, "tokens_per_second_per_gpu": 17987.94, "total_tokens": 134495632 }, { "epoch": 0.08520880220055013, "grad_norm": 1.016546607017517, "learning_rate": 1.703564727954972e-05, "loss": 0.87, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1363, "tokens_per_second_per_gpu": 17367.08, "total_tokens": 134591517 }, { "epoch": 0.08527131782945736, "grad_norm": 0.9768506288528442, "learning_rate": 1.7048155096935586e-05, "loss": 0.8515, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1364, "tokens_per_second_per_gpu": 17720.09, "total_tokens": 134694610 }, { "epoch": 0.0853338334583646, "grad_norm": 0.9699006080627441, "learning_rate": 1.706066291432145e-05, "loss": 0.8718, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1365, "tokens_per_second_per_gpu": 16889.72, "total_tokens": 134794458 }, { "epoch": 0.08539634908727181, "grad_norm": 1.0232561826705933, "learning_rate": 1.7073170731707317e-05, "loss": 0.8935, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1366, "tokens_per_second_per_gpu": 17463.17, "total_tokens": 134893508 }, { "epoch": 0.08545886471617904, "grad_norm": 0.9175140261650085, "learning_rate": 1.7085678549093185e-05, "loss": 0.8019, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1367, "tokens_per_second_per_gpu": 17580.77, "total_tokens": 134992647 }, { "epoch": 0.08552138034508627, "grad_norm": 0.9612630605697632, "learning_rate": 1.709818636647905e-05, "loss": 0.805, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1368, "tokens_per_second_per_gpu": 17018.25, "total_tokens": 135089068 }, { "epoch": 0.0855838959739935, "grad_norm": 0.9928236603736877, "learning_rate": 1.711069418386492e-05, "loss": 0.849, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1369, "tokens_per_second_per_gpu": 16448.96, "total_tokens": 135186220 }, { "epoch": 0.08564641160290072, "grad_norm": 0.9916746020317078, "learning_rate": 1.7123202001250784e-05, "loss": 0.838, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1370, "tokens_per_second_per_gpu": 16806.48, "total_tokens": 135284128 }, { "epoch": 0.08570892723180795, "grad_norm": 0.997134804725647, "learning_rate": 1.713570981863665e-05, "loss": 0.8155, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1371, "tokens_per_second_per_gpu": 16951.68, "total_tokens": 135383793 }, { "epoch": 0.08577144286071518, "grad_norm": 1.0571202039718628, "learning_rate": 1.7148217636022515e-05, "loss": 0.8534, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1372, "tokens_per_second_per_gpu": 16141.45, "total_tokens": 135480399 }, { "epoch": 0.0858339584896224, "grad_norm": 0.957426905632019, "learning_rate": 1.7160725453408383e-05, "loss": 0.8058, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1373, "tokens_per_second_per_gpu": 17627.92, "total_tokens": 135580421 }, { "epoch": 0.08589647411852963, "grad_norm": 0.939639151096344, "learning_rate": 1.717323327079425e-05, "loss": 0.7676, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1374, "tokens_per_second_per_gpu": 15952.26, "total_tokens": 135676804 }, { "epoch": 0.08595898974743686, "grad_norm": 0.9990182518959045, "learning_rate": 1.7185741088180114e-05, "loss": 0.8076, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1375, "tokens_per_second_per_gpu": 16630.2, "total_tokens": 135774252 }, { "epoch": 0.08602150537634409, "grad_norm": 1.0538272857666016, "learning_rate": 1.719824890556598e-05, "loss": 0.8252, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1376, "tokens_per_second_per_gpu": 16499.47, "total_tokens": 135868031 }, { "epoch": 0.08608402100525131, "grad_norm": 1.0239237546920776, "learning_rate": 1.7210756722951847e-05, "loss": 0.8891, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1377, "tokens_per_second_per_gpu": 16659.77, "total_tokens": 135966797 }, { "epoch": 0.08614653663415854, "grad_norm": 0.9647334218025208, "learning_rate": 1.7223264540337712e-05, "loss": 0.8671, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1378, "tokens_per_second_per_gpu": 18066.7, "total_tokens": 136068728 }, { "epoch": 0.08620905226306577, "grad_norm": 0.983814001083374, "learning_rate": 1.7235772357723578e-05, "loss": 0.8264, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1379, "tokens_per_second_per_gpu": 16750.57, "total_tokens": 136164638 }, { "epoch": 0.08627156789197299, "grad_norm": 0.9645885825157166, "learning_rate": 1.7248280175109443e-05, "loss": 0.8266, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1380, "tokens_per_second_per_gpu": 18461.04, "total_tokens": 136261943 }, { "epoch": 0.08633408352088022, "grad_norm": 0.9380490183830261, "learning_rate": 1.726078799249531e-05, "loss": 0.8121, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1381, "tokens_per_second_per_gpu": 16759.68, "total_tokens": 136358479 }, { "epoch": 0.08639659914978745, "grad_norm": 0.9488956928253174, "learning_rate": 1.7273295809881177e-05, "loss": 0.8101, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1382, "tokens_per_second_per_gpu": 18718.63, "total_tokens": 136460099 }, { "epoch": 0.08645911477869467, "grad_norm": 0.9520459771156311, "learning_rate": 1.7285803627267045e-05, "loss": 0.8238, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1383, "tokens_per_second_per_gpu": 17957.45, "total_tokens": 136562569 }, { "epoch": 0.0865216304076019, "grad_norm": 1.011961817741394, "learning_rate": 1.729831144465291e-05, "loss": 0.867, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1384, "tokens_per_second_per_gpu": 18101.15, "total_tokens": 136663839 }, { "epoch": 0.08658414603650913, "grad_norm": 0.9794410467147827, "learning_rate": 1.7310819262038776e-05, "loss": 0.8068, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1385, "tokens_per_second_per_gpu": 17898.58, "total_tokens": 136763350 }, { "epoch": 0.08664666166541636, "grad_norm": 1.0039727687835693, "learning_rate": 1.732332707942464e-05, "loss": 0.8295, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1386, "tokens_per_second_per_gpu": 16543.63, "total_tokens": 136860569 }, { "epoch": 0.08670917729432358, "grad_norm": 1.0908515453338623, "learning_rate": 1.733583489681051e-05, "loss": 0.8397, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1387, "tokens_per_second_per_gpu": 16943.68, "total_tokens": 136955426 }, { "epoch": 0.08677169292323081, "grad_norm": 0.9596145153045654, "learning_rate": 1.7348342714196375e-05, "loss": 0.8186, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1388, "tokens_per_second_per_gpu": 17089.25, "total_tokens": 137055512 }, { "epoch": 0.08683420855213804, "grad_norm": 0.9653152227401733, "learning_rate": 1.736085053158224e-05, "loss": 0.859, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1389, "tokens_per_second_per_gpu": 17065.74, "total_tokens": 137155213 }, { "epoch": 0.08689672418104526, "grad_norm": 0.9556533694267273, "learning_rate": 1.7373358348968105e-05, "loss": 0.8063, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1390, "tokens_per_second_per_gpu": 16574.48, "total_tokens": 137252054 }, { "epoch": 0.08695923980995249, "grad_norm": 1.002769112586975, "learning_rate": 1.7385866166353974e-05, "loss": 0.8161, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1391, "tokens_per_second_per_gpu": 17266.05, "total_tokens": 137351027 }, { "epoch": 0.08702175543885972, "grad_norm": 0.9357550144195557, "learning_rate": 1.739837398373984e-05, "loss": 0.8157, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1392, "tokens_per_second_per_gpu": 17783.34, "total_tokens": 137450287 }, { "epoch": 0.08708427106776694, "grad_norm": 1.0250160694122314, "learning_rate": 1.7410881801125704e-05, "loss": 0.8067, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1393, "tokens_per_second_per_gpu": 16173.29, "total_tokens": 137542230 }, { "epoch": 0.08714678669667417, "grad_norm": 0.9639648795127869, "learning_rate": 1.742338961851157e-05, "loss": 0.8423, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1394, "tokens_per_second_per_gpu": 18580.41, "total_tokens": 137647111 }, { "epoch": 0.0872093023255814, "grad_norm": 0.9573659896850586, "learning_rate": 1.7435897435897438e-05, "loss": 0.8329, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1395, "tokens_per_second_per_gpu": 17545.4, "total_tokens": 137747293 }, { "epoch": 0.08727181795448863, "grad_norm": 0.9769881367683411, "learning_rate": 1.7448405253283303e-05, "loss": 0.7845, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1396, "tokens_per_second_per_gpu": 16844.04, "total_tokens": 137845355 }, { "epoch": 0.08733433358339585, "grad_norm": 0.9830997586250305, "learning_rate": 1.746091307066917e-05, "loss": 0.8252, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1397, "tokens_per_second_per_gpu": 16838.01, "total_tokens": 137944292 }, { "epoch": 0.08739684921230308, "grad_norm": 0.957428514957428, "learning_rate": 1.7473420888055033e-05, "loss": 0.7997, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1398, "tokens_per_second_per_gpu": 17103.32, "total_tokens": 138040053 }, { "epoch": 0.08745936484121031, "grad_norm": 1.0100868940353394, "learning_rate": 1.7485928705440902e-05, "loss": 0.8726, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1399, "tokens_per_second_per_gpu": 16062.09, "total_tokens": 138133374 }, { "epoch": 0.08752188047011752, "grad_norm": 0.9665644764900208, "learning_rate": 1.7498436522826767e-05, "loss": 0.8465, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1400, "tokens_per_second_per_gpu": 17896.84, "total_tokens": 138231655 }, { "epoch": 0.08758439609902476, "grad_norm": 0.9926999807357788, "learning_rate": 1.7510944340212636e-05, "loss": 0.8625, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1401, "tokens_per_second_per_gpu": 16878.04, "total_tokens": 138333768 }, { "epoch": 0.08764691172793199, "grad_norm": 1.0282636880874634, "learning_rate": 1.75234521575985e-05, "loss": 0.8504, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1402, "tokens_per_second_per_gpu": 18129.73, "total_tokens": 138434023 }, { "epoch": 0.0877094273568392, "grad_norm": 0.9701619148254395, "learning_rate": 1.7535959974984366e-05, "loss": 0.8083, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1403, "tokens_per_second_per_gpu": 17593.65, "total_tokens": 138535491 }, { "epoch": 0.08777194298574643, "grad_norm": 0.9670308828353882, "learning_rate": 1.754846779237023e-05, "loss": 0.7979, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1404, "tokens_per_second_per_gpu": 17346.88, "total_tokens": 138632970 }, { "epoch": 0.08783445861465367, "grad_norm": 0.9826593995094299, "learning_rate": 1.75609756097561e-05, "loss": 0.8591, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1405, "tokens_per_second_per_gpu": 18869.52, "total_tokens": 138733077 }, { "epoch": 0.0878969742435609, "grad_norm": 0.9714495539665222, "learning_rate": 1.7573483427141965e-05, "loss": 0.8688, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1406, "tokens_per_second_per_gpu": 17900.16, "total_tokens": 138833445 }, { "epoch": 0.08795948987246811, "grad_norm": 1.0004438161849976, "learning_rate": 1.758599124452783e-05, "loss": 0.8711, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1407, "tokens_per_second_per_gpu": 17554.61, "total_tokens": 138935930 }, { "epoch": 0.08802200550137534, "grad_norm": 0.9728232622146606, "learning_rate": 1.7598499061913695e-05, "loss": 0.8631, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1408, "tokens_per_second_per_gpu": 17553.42, "total_tokens": 139038561 }, { "epoch": 0.08808452113028258, "grad_norm": 1.07818603515625, "learning_rate": 1.7611006879299564e-05, "loss": 0.8088, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1409, "tokens_per_second_per_gpu": 17190.53, "total_tokens": 139132409 }, { "epoch": 0.08814703675918979, "grad_norm": 0.9396911263465881, "learning_rate": 1.7623514696685432e-05, "loss": 0.8938, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1410, "tokens_per_second_per_gpu": 17898.49, "total_tokens": 139237512 }, { "epoch": 0.08820955238809702, "grad_norm": 0.9962442517280579, "learning_rate": 1.7636022514071294e-05, "loss": 0.8417, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1411, "tokens_per_second_per_gpu": 16214.94, "total_tokens": 139335829 }, { "epoch": 0.08827206801700425, "grad_norm": 0.9199680685997009, "learning_rate": 1.7648530331457163e-05, "loss": 0.7916, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1412, "tokens_per_second_per_gpu": 18088.69, "total_tokens": 139436565 }, { "epoch": 0.08833458364591147, "grad_norm": 0.9706300497055054, "learning_rate": 1.7661038148843028e-05, "loss": 0.8417, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1413, "tokens_per_second_per_gpu": 18159.91, "total_tokens": 139537455 }, { "epoch": 0.0883970992748187, "grad_norm": 0.9610962867736816, "learning_rate": 1.7673545966228897e-05, "loss": 0.8468, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1414, "tokens_per_second_per_gpu": 17144.74, "total_tokens": 139636128 }, { "epoch": 0.08845961490372593, "grad_norm": 0.9583474397659302, "learning_rate": 1.7686053783614762e-05, "loss": 0.8231, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1415, "tokens_per_second_per_gpu": 17564.02, "total_tokens": 139735335 }, { "epoch": 0.08852213053263316, "grad_norm": 0.980315625667572, "learning_rate": 1.7698561601000627e-05, "loss": 0.8085, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1416, "tokens_per_second_per_gpu": 17372.44, "total_tokens": 139832569 }, { "epoch": 0.08858464616154038, "grad_norm": 0.9911327362060547, "learning_rate": 1.7711069418386492e-05, "loss": 0.86, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1417, "tokens_per_second_per_gpu": 16157.39, "total_tokens": 139931866 }, { "epoch": 0.08864716179044761, "grad_norm": 0.9455156326293945, "learning_rate": 1.772357723577236e-05, "loss": 0.8179, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1418, "tokens_per_second_per_gpu": 17495.29, "total_tokens": 140029227 }, { "epoch": 0.08870967741935484, "grad_norm": 0.9954439401626587, "learning_rate": 1.7736085053158226e-05, "loss": 0.882, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1419, "tokens_per_second_per_gpu": 16943.95, "total_tokens": 140129865 }, { "epoch": 0.08877219304826206, "grad_norm": 0.9670276045799255, "learning_rate": 1.774859287054409e-05, "loss": 0.8621, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1420, "tokens_per_second_per_gpu": 18123.46, "total_tokens": 140232765 }, { "epoch": 0.08883470867716929, "grad_norm": 0.9882900714874268, "learning_rate": 1.7761100687929956e-05, "loss": 0.8542, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1421, "tokens_per_second_per_gpu": 17742.77, "total_tokens": 140330214 }, { "epoch": 0.08889722430607652, "grad_norm": 0.9520305395126343, "learning_rate": 1.7773608505315825e-05, "loss": 0.8892, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1422, "tokens_per_second_per_gpu": 16640.25, "total_tokens": 140429555 }, { "epoch": 0.08895973993498374, "grad_norm": 0.9231758713722229, "learning_rate": 1.778611632270169e-05, "loss": 0.8344, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1423, "tokens_per_second_per_gpu": 18561.13, "total_tokens": 140533145 }, { "epoch": 0.08902225556389097, "grad_norm": 0.9779260754585266, "learning_rate": 1.779862414008756e-05, "loss": 0.8601, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1424, "tokens_per_second_per_gpu": 18220.89, "total_tokens": 140633542 }, { "epoch": 0.0890847711927982, "grad_norm": 0.9913742542266846, "learning_rate": 1.781113195747342e-05, "loss": 0.791, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1425, "tokens_per_second_per_gpu": 16116.72, "total_tokens": 140727703 }, { "epoch": 0.08914728682170543, "grad_norm": 0.9581534266471863, "learning_rate": 1.782363977485929e-05, "loss": 0.8256, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1426, "tokens_per_second_per_gpu": 17658.47, "total_tokens": 140825339 }, { "epoch": 0.08920980245061265, "grad_norm": 0.9680148363113403, "learning_rate": 1.7836147592245154e-05, "loss": 0.786, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1427, "tokens_per_second_per_gpu": 16551.18, "total_tokens": 140922657 }, { "epoch": 0.08927231807951988, "grad_norm": 0.930088996887207, "learning_rate": 1.7848655409631023e-05, "loss": 0.8425, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1428, "tokens_per_second_per_gpu": 17394.94, "total_tokens": 141023975 }, { "epoch": 0.08933483370842711, "grad_norm": 0.9624522924423218, "learning_rate": 1.7861163227016888e-05, "loss": 0.87, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1429, "tokens_per_second_per_gpu": 18060.6, "total_tokens": 141123601 }, { "epoch": 0.08939734933733433, "grad_norm": 0.955414891242981, "learning_rate": 1.7873671044402753e-05, "loss": 0.8297, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1430, "tokens_per_second_per_gpu": 17396.86, "total_tokens": 141222496 }, { "epoch": 0.08945986496624156, "grad_norm": 0.9717230796813965, "learning_rate": 1.788617886178862e-05, "loss": 0.8431, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1431, "tokens_per_second_per_gpu": 16217.21, "total_tokens": 141317552 }, { "epoch": 0.08952238059514879, "grad_norm": 0.9478108286857605, "learning_rate": 1.7898686679174487e-05, "loss": 0.7954, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1432, "tokens_per_second_per_gpu": 16107.18, "total_tokens": 141412519 }, { "epoch": 0.08958489622405602, "grad_norm": 1.0218435525894165, "learning_rate": 1.7911194496560352e-05, "loss": 0.8424, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1433, "tokens_per_second_per_gpu": 17605.37, "total_tokens": 141510313 }, { "epoch": 0.08964741185296324, "grad_norm": 0.9864197969436646, "learning_rate": 1.7923702313946217e-05, "loss": 0.8749, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1434, "tokens_per_second_per_gpu": 17933.87, "total_tokens": 141612700 }, { "epoch": 0.08970992748187047, "grad_norm": 1.074425458908081, "learning_rate": 1.7936210131332083e-05, "loss": 0.8148, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1435, "tokens_per_second_per_gpu": 16564.57, "total_tokens": 141707755 }, { "epoch": 0.0897724431107777, "grad_norm": 1.0264513492584229, "learning_rate": 1.794871794871795e-05, "loss": 0.8208, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1436, "tokens_per_second_per_gpu": 16731.2, "total_tokens": 141805686 }, { "epoch": 0.08983495873968492, "grad_norm": 1.0013082027435303, "learning_rate": 1.7961225766103816e-05, "loss": 0.8559, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1437, "tokens_per_second_per_gpu": 17791.69, "total_tokens": 141905874 }, { "epoch": 0.08989747436859215, "grad_norm": 0.9776994585990906, "learning_rate": 1.797373358348968e-05, "loss": 0.8049, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1438, "tokens_per_second_per_gpu": 17329.31, "total_tokens": 142003851 }, { "epoch": 0.08995998999749938, "grad_norm": 1.0095961093902588, "learning_rate": 1.7986241400875547e-05, "loss": 0.8441, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1439, "tokens_per_second_per_gpu": 17876.87, "total_tokens": 142104568 }, { "epoch": 0.0900225056264066, "grad_norm": 0.9812846779823303, "learning_rate": 1.7998749218261415e-05, "loss": 0.7764, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1440, "tokens_per_second_per_gpu": 16972.21, "total_tokens": 142194856 }, { "epoch": 0.09008502125531383, "grad_norm": 0.9891849756240845, "learning_rate": 1.801125703564728e-05, "loss": 0.837, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1441, "tokens_per_second_per_gpu": 17176.17, "total_tokens": 142292594 }, { "epoch": 0.09014753688422106, "grad_norm": 0.932945191860199, "learning_rate": 1.802376485303315e-05, "loss": 0.8248, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1442, "tokens_per_second_per_gpu": 17829.57, "total_tokens": 142390267 }, { "epoch": 0.09021005251312829, "grad_norm": 1.0222023725509644, "learning_rate": 1.8036272670419014e-05, "loss": 0.8118, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1443, "tokens_per_second_per_gpu": 17773.15, "total_tokens": 142487471 }, { "epoch": 0.0902725681420355, "grad_norm": 1.0022029876708984, "learning_rate": 1.804878048780488e-05, "loss": 0.8268, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1444, "tokens_per_second_per_gpu": 17593.6, "total_tokens": 142589656 }, { "epoch": 0.09033508377094274, "grad_norm": 0.9971532821655273, "learning_rate": 1.8061288305190745e-05, "loss": 0.8512, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1445, "tokens_per_second_per_gpu": 16941.16, "total_tokens": 142688192 }, { "epoch": 0.09039759939984997, "grad_norm": 1.0275427103042603, "learning_rate": 1.8073796122576613e-05, "loss": 0.8342, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1446, "tokens_per_second_per_gpu": 17359.54, "total_tokens": 142785584 }, { "epoch": 0.09046011502875718, "grad_norm": 1.013643503189087, "learning_rate": 1.808630393996248e-05, "loss": 0.854, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1447, "tokens_per_second_per_gpu": 17045.06, "total_tokens": 142885048 }, { "epoch": 0.09052263065766442, "grad_norm": 0.9547833800315857, "learning_rate": 1.8098811757348344e-05, "loss": 0.7656, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1448, "tokens_per_second_per_gpu": 17159.93, "total_tokens": 142981841 }, { "epoch": 0.09058514628657165, "grad_norm": 0.9607055187225342, "learning_rate": 1.811131957473421e-05, "loss": 0.8245, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1449, "tokens_per_second_per_gpu": 16967.77, "total_tokens": 143080438 }, { "epoch": 0.09064766191547886, "grad_norm": 0.9770767688751221, "learning_rate": 1.8123827392120077e-05, "loss": 0.8096, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1450, "tokens_per_second_per_gpu": 17545.58, "total_tokens": 143177928 }, { "epoch": 0.0907101775443861, "grad_norm": 0.994426965713501, "learning_rate": 1.8136335209505943e-05, "loss": 0.8395, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1451, "tokens_per_second_per_gpu": 17964.94, "total_tokens": 143280008 }, { "epoch": 0.09077269317329333, "grad_norm": 0.9894771575927734, "learning_rate": 1.8148843026891808e-05, "loss": 0.8873, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1452, "tokens_per_second_per_gpu": 17194.67, "total_tokens": 143378642 }, { "epoch": 0.09083520880220056, "grad_norm": 0.9607563018798828, "learning_rate": 1.8161350844277673e-05, "loss": 0.8314, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1453, "tokens_per_second_per_gpu": 17885.78, "total_tokens": 143478805 }, { "epoch": 0.09089772443110777, "grad_norm": 0.9751229882240295, "learning_rate": 1.817385866166354e-05, "loss": 0.8393, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1454, "tokens_per_second_per_gpu": 16663.25, "total_tokens": 143578817 }, { "epoch": 0.090960240060015, "grad_norm": 1.0155445337295532, "learning_rate": 1.8186366479049407e-05, "loss": 0.8416, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1455, "tokens_per_second_per_gpu": 17748.1, "total_tokens": 143675756 }, { "epoch": 0.09102275568892224, "grad_norm": 0.9799195528030396, "learning_rate": 1.8198874296435275e-05, "loss": 0.8173, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1456, "tokens_per_second_per_gpu": 17890.93, "total_tokens": 143779132 }, { "epoch": 0.09108527131782945, "grad_norm": 1.0124993324279785, "learning_rate": 1.821138211382114e-05, "loss": 0.8457, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1457, "tokens_per_second_per_gpu": 17760.87, "total_tokens": 143877119 }, { "epoch": 0.09114778694673668, "grad_norm": 0.9771429300308228, "learning_rate": 1.8223889931207006e-05, "loss": 0.8254, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1458, "tokens_per_second_per_gpu": 18158.99, "total_tokens": 143980984 }, { "epoch": 0.09121030257564391, "grad_norm": 0.9357606768608093, "learning_rate": 1.823639774859287e-05, "loss": 0.7935, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1459, "tokens_per_second_per_gpu": 17577.5, "total_tokens": 144077172 }, { "epoch": 0.09127281820455113, "grad_norm": 0.9745304584503174, "learning_rate": 1.824890556597874e-05, "loss": 0.8478, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1460, "tokens_per_second_per_gpu": 19092.49, "total_tokens": 144179402 }, { "epoch": 0.09133533383345836, "grad_norm": 0.9933697581291199, "learning_rate": 1.8261413383364605e-05, "loss": 0.8209, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1461, "tokens_per_second_per_gpu": 17874.9, "total_tokens": 144280823 }, { "epoch": 0.0913978494623656, "grad_norm": 0.9196396470069885, "learning_rate": 1.827392120075047e-05, "loss": 0.7944, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1462, "tokens_per_second_per_gpu": 17864.11, "total_tokens": 144382552 }, { "epoch": 0.09146036509127282, "grad_norm": 1.022716999053955, "learning_rate": 1.8286429018136335e-05, "loss": 0.8839, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1463, "tokens_per_second_per_gpu": 18426.15, "total_tokens": 144487944 }, { "epoch": 0.09152288072018004, "grad_norm": 0.9379662275314331, "learning_rate": 1.8298936835522204e-05, "loss": 0.7949, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1464, "tokens_per_second_per_gpu": 17456.44, "total_tokens": 144589107 }, { "epoch": 0.09158539634908727, "grad_norm": 1.055497407913208, "learning_rate": 1.831144465290807e-05, "loss": 0.8151, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1465, "tokens_per_second_per_gpu": 17195.48, "total_tokens": 144684138 }, { "epoch": 0.0916479119779945, "grad_norm": 0.9528425931930542, "learning_rate": 1.8323952470293934e-05, "loss": 0.844, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1466, "tokens_per_second_per_gpu": 18602.24, "total_tokens": 144787316 }, { "epoch": 0.09171042760690172, "grad_norm": 0.9335885047912598, "learning_rate": 1.83364602876798e-05, "loss": 0.8836, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1467, "tokens_per_second_per_gpu": 18636.54, "total_tokens": 144890643 }, { "epoch": 0.09177294323580895, "grad_norm": 0.9699863195419312, "learning_rate": 1.8348968105065668e-05, "loss": 0.8045, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1468, "tokens_per_second_per_gpu": 15278.35, "total_tokens": 144983088 }, { "epoch": 0.09183545886471618, "grad_norm": 0.9622240662574768, "learning_rate": 1.8361475922451533e-05, "loss": 0.7881, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1469, "tokens_per_second_per_gpu": 17235.4, "total_tokens": 145080350 }, { "epoch": 0.0918979744936234, "grad_norm": 0.9419960975646973, "learning_rate": 1.83739837398374e-05, "loss": 0.8213, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1470, "tokens_per_second_per_gpu": 17009.19, "total_tokens": 145178370 }, { "epoch": 0.09196049012253063, "grad_norm": 0.9788525104522705, "learning_rate": 1.8386491557223263e-05, "loss": 0.8386, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1471, "tokens_per_second_per_gpu": 17853.43, "total_tokens": 145277479 }, { "epoch": 0.09202300575143786, "grad_norm": 0.9911312460899353, "learning_rate": 1.8398999374609132e-05, "loss": 0.8258, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1472, "tokens_per_second_per_gpu": 17305.82, "total_tokens": 145373208 }, { "epoch": 0.09208552138034509, "grad_norm": 0.9784603118896484, "learning_rate": 1.8411507191995e-05, "loss": 0.8564, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1473, "tokens_per_second_per_gpu": 17113.39, "total_tokens": 145471851 }, { "epoch": 0.09214803700925231, "grad_norm": 0.9567956924438477, "learning_rate": 1.8424015009380866e-05, "loss": 0.763, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1474, "tokens_per_second_per_gpu": 17429.6, "total_tokens": 145571308 }, { "epoch": 0.09221055263815954, "grad_norm": 1.0262746810913086, "learning_rate": 1.843652282676673e-05, "loss": 0.8471, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1475, "tokens_per_second_per_gpu": 16779.81, "total_tokens": 145666751 }, { "epoch": 0.09227306826706677, "grad_norm": 1.0203872919082642, "learning_rate": 1.8449030644152596e-05, "loss": 0.8623, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1476, "tokens_per_second_per_gpu": 16924.49, "total_tokens": 145762074 }, { "epoch": 0.09233558389597399, "grad_norm": 0.9635571241378784, "learning_rate": 1.8461538461538465e-05, "loss": 0.8433, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1477, "tokens_per_second_per_gpu": 17675.87, "total_tokens": 145859388 }, { "epoch": 0.09239809952488122, "grad_norm": 1.0029189586639404, "learning_rate": 1.847404627892433e-05, "loss": 0.8519, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1478, "tokens_per_second_per_gpu": 16033.11, "total_tokens": 145954285 }, { "epoch": 0.09246061515378845, "grad_norm": 1.0145248174667358, "learning_rate": 1.8486554096310195e-05, "loss": 0.8034, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1479, "tokens_per_second_per_gpu": 16902.66, "total_tokens": 146052051 }, { "epoch": 0.09252313078269567, "grad_norm": 1.0284416675567627, "learning_rate": 1.849906191369606e-05, "loss": 0.8394, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1480, "tokens_per_second_per_gpu": 17378.03, "total_tokens": 146154889 }, { "epoch": 0.0925856464116029, "grad_norm": 0.956176221370697, "learning_rate": 1.851156973108193e-05, "loss": 0.8201, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1481, "tokens_per_second_per_gpu": 17324.96, "total_tokens": 146252453 }, { "epoch": 0.09264816204051013, "grad_norm": 0.9687169194221497, "learning_rate": 1.8524077548467794e-05, "loss": 0.8573, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1482, "tokens_per_second_per_gpu": 18231.18, "total_tokens": 146356901 }, { "epoch": 0.09271067766941736, "grad_norm": 1.0204463005065918, "learning_rate": 1.8536585365853663e-05, "loss": 0.8613, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1483, "tokens_per_second_per_gpu": 17430.04, "total_tokens": 146453968 }, { "epoch": 0.09277319329832458, "grad_norm": 0.9591604471206665, "learning_rate": 1.8549093183239524e-05, "loss": 0.8206, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1484, "tokens_per_second_per_gpu": 16411.21, "total_tokens": 146553888 }, { "epoch": 0.09283570892723181, "grad_norm": 0.9657684564590454, "learning_rate": 1.8561601000625393e-05, "loss": 0.7997, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1485, "tokens_per_second_per_gpu": 17253.75, "total_tokens": 146650215 }, { "epoch": 0.09289822455613904, "grad_norm": 0.9646934866905212, "learning_rate": 1.8574108818011258e-05, "loss": 0.837, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1486, "tokens_per_second_per_gpu": 18079.76, "total_tokens": 146751199 }, { "epoch": 0.09296074018504626, "grad_norm": 1.0395512580871582, "learning_rate": 1.8586616635397127e-05, "loss": 0.8097, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1487, "tokens_per_second_per_gpu": 17502.73, "total_tokens": 146849182 }, { "epoch": 0.09302325581395349, "grad_norm": 1.0479496717453003, "learning_rate": 1.8599124452782992e-05, "loss": 0.8499, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1488, "tokens_per_second_per_gpu": 17372.98, "total_tokens": 146945630 }, { "epoch": 0.09308577144286072, "grad_norm": 0.9670098423957825, "learning_rate": 1.8611632270168857e-05, "loss": 0.8111, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1489, "tokens_per_second_per_gpu": 17560.71, "total_tokens": 147044561 }, { "epoch": 0.09314828707176794, "grad_norm": 0.9756729006767273, "learning_rate": 1.8624140087554722e-05, "loss": 0.8401, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1490, "tokens_per_second_per_gpu": 18218.88, "total_tokens": 147148488 }, { "epoch": 0.09321080270067517, "grad_norm": 0.9932339787483215, "learning_rate": 1.863664790494059e-05, "loss": 0.8143, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1491, "tokens_per_second_per_gpu": 17363.12, "total_tokens": 147246396 }, { "epoch": 0.0932733183295824, "grad_norm": 0.9918854236602783, "learning_rate": 1.8649155722326456e-05, "loss": 0.8719, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1492, "tokens_per_second_per_gpu": 17078.4, "total_tokens": 147346606 }, { "epoch": 0.09333583395848963, "grad_norm": 0.9561173915863037, "learning_rate": 1.866166353971232e-05, "loss": 0.8225, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1493, "tokens_per_second_per_gpu": 17366.47, "total_tokens": 147445168 }, { "epoch": 0.09339834958739685, "grad_norm": 1.039458155632019, "learning_rate": 1.8674171357098186e-05, "loss": 0.8812, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1494, "tokens_per_second_per_gpu": 18640.22, "total_tokens": 147548629 }, { "epoch": 0.09346086521630408, "grad_norm": 0.9912002682685852, "learning_rate": 1.8686679174484055e-05, "loss": 0.8359, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1495, "tokens_per_second_per_gpu": 18449.25, "total_tokens": 147654534 }, { "epoch": 0.0935233808452113, "grad_norm": 0.9734987616539001, "learning_rate": 1.869918699186992e-05, "loss": 0.8421, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1496, "tokens_per_second_per_gpu": 17732.93, "total_tokens": 147756405 }, { "epoch": 0.09358589647411852, "grad_norm": 0.9504369497299194, "learning_rate": 1.871169480925579e-05, "loss": 0.8238, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1497, "tokens_per_second_per_gpu": 16967.44, "total_tokens": 147855788 }, { "epoch": 0.09364841210302575, "grad_norm": 0.995909571647644, "learning_rate": 1.872420262664165e-05, "loss": 0.8346, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1498, "tokens_per_second_per_gpu": 16800.19, "total_tokens": 147953207 }, { "epoch": 0.09371092773193299, "grad_norm": 0.9443106055259705, "learning_rate": 1.873671044402752e-05, "loss": 0.8548, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1499, "tokens_per_second_per_gpu": 18318.98, "total_tokens": 148056834 }, { "epoch": 0.09377344336084022, "grad_norm": 1.0622934103012085, "learning_rate": 1.8749218261413384e-05, "loss": 0.8344, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1500, "tokens_per_second_per_gpu": 17596.87, "total_tokens": 148155402 }, { "epoch": 0.09383595898974743, "grad_norm": 1.0372298955917358, "learning_rate": 1.8761726078799253e-05, "loss": 0.8482, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1501, "tokens_per_second_per_gpu": 18398.01, "total_tokens": 148256622 }, { "epoch": 0.09389847461865466, "grad_norm": 1.0127416849136353, "learning_rate": 1.8774233896185118e-05, "loss": 0.8631, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1502, "tokens_per_second_per_gpu": 17445.84, "total_tokens": 148355001 }, { "epoch": 0.0939609902475619, "grad_norm": 1.0378305912017822, "learning_rate": 1.8786741713570983e-05, "loss": 0.8195, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1503, "tokens_per_second_per_gpu": 18027.22, "total_tokens": 148455303 }, { "epoch": 0.09402350587646911, "grad_norm": 0.996759295463562, "learning_rate": 1.879924953095685e-05, "loss": 0.8154, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1504, "tokens_per_second_per_gpu": 18164.09, "total_tokens": 148551515 }, { "epoch": 0.09408602150537634, "grad_norm": 0.961785078048706, "learning_rate": 1.8811757348342717e-05, "loss": 0.8953, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1505, "tokens_per_second_per_gpu": 17765.1, "total_tokens": 148651157 }, { "epoch": 0.09414853713428357, "grad_norm": 0.9511022567749023, "learning_rate": 1.8824265165728582e-05, "loss": 0.7935, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1506, "tokens_per_second_per_gpu": 17694.09, "total_tokens": 148749744 }, { "epoch": 0.09421105276319079, "grad_norm": 0.9964797496795654, "learning_rate": 1.8836772983114447e-05, "loss": 0.8455, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1507, "tokens_per_second_per_gpu": 18271.11, "total_tokens": 148854420 }, { "epoch": 0.09427356839209802, "grad_norm": 1.00594162940979, "learning_rate": 1.8849280800500313e-05, "loss": 0.8418, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1508, "tokens_per_second_per_gpu": 17374.58, "total_tokens": 148953676 }, { "epoch": 0.09433608402100525, "grad_norm": 0.9683927297592163, "learning_rate": 1.886178861788618e-05, "loss": 0.8535, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1509, "tokens_per_second_per_gpu": 17264.22, "total_tokens": 149052074 }, { "epoch": 0.09439859964991248, "grad_norm": 0.985188901424408, "learning_rate": 1.8874296435272046e-05, "loss": 0.8127, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1510, "tokens_per_second_per_gpu": 17613.4, "total_tokens": 149151834 }, { "epoch": 0.0944611152788197, "grad_norm": 0.9997313022613525, "learning_rate": 1.888680425265791e-05, "loss": 0.8242, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1511, "tokens_per_second_per_gpu": 16345.16, "total_tokens": 149246891 }, { "epoch": 0.09452363090772693, "grad_norm": 1.0034979581832886, "learning_rate": 1.8899312070043777e-05, "loss": 0.7955, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1512, "tokens_per_second_per_gpu": 17481.83, "total_tokens": 149345869 }, { "epoch": 0.09458614653663416, "grad_norm": 0.9917998313903809, "learning_rate": 1.8911819887429645e-05, "loss": 0.8312, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1513, "tokens_per_second_per_gpu": 17707.97, "total_tokens": 149446656 }, { "epoch": 0.09464866216554138, "grad_norm": 1.0060020685195923, "learning_rate": 1.892432770481551e-05, "loss": 0.7811, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1514, "tokens_per_second_per_gpu": 17083.39, "total_tokens": 149542160 }, { "epoch": 0.09471117779444861, "grad_norm": 0.970781683921814, "learning_rate": 1.893683552220138e-05, "loss": 0.798, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1515, "tokens_per_second_per_gpu": 17461.75, "total_tokens": 149642357 }, { "epoch": 0.09477369342335584, "grad_norm": 0.9609295725822449, "learning_rate": 1.8949343339587244e-05, "loss": 0.8069, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1516, "tokens_per_second_per_gpu": 17631.2, "total_tokens": 149743391 }, { "epoch": 0.09483620905226306, "grad_norm": 1.0767734050750732, "learning_rate": 1.896185115697311e-05, "loss": 0.7896, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1517, "tokens_per_second_per_gpu": 17831.43, "total_tokens": 149843367 }, { "epoch": 0.09489872468117029, "grad_norm": 1.0043554306030273, "learning_rate": 1.8974358974358975e-05, "loss": 0.7934, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1518, "tokens_per_second_per_gpu": 16699.98, "total_tokens": 149937761 }, { "epoch": 0.09496124031007752, "grad_norm": 1.0095475912094116, "learning_rate": 1.8986866791744843e-05, "loss": 0.8129, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1519, "tokens_per_second_per_gpu": 15524.82, "total_tokens": 150033241 }, { "epoch": 0.09502375593898475, "grad_norm": 0.9425175786018372, "learning_rate": 1.899937460913071e-05, "loss": 0.8122, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1520, "tokens_per_second_per_gpu": 17341.79, "total_tokens": 150135910 }, { "epoch": 0.09508627156789197, "grad_norm": 0.99261474609375, "learning_rate": 1.9011882426516574e-05, "loss": 0.806, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1521, "tokens_per_second_per_gpu": 17081.39, "total_tokens": 150234082 }, { "epoch": 0.0951487871967992, "grad_norm": 1.1285958290100098, "learning_rate": 1.902439024390244e-05, "loss": 0.8187, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1522, "tokens_per_second_per_gpu": 16842.86, "total_tokens": 150331536 }, { "epoch": 0.09521130282570643, "grad_norm": 0.9222580194473267, "learning_rate": 1.9036898061288307e-05, "loss": 0.7877, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1523, "tokens_per_second_per_gpu": 18071.99, "total_tokens": 150431406 }, { "epoch": 0.09527381845461365, "grad_norm": 0.9682517051696777, "learning_rate": 1.9049405878674173e-05, "loss": 0.8415, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1524, "tokens_per_second_per_gpu": 17158.63, "total_tokens": 150529445 }, { "epoch": 0.09533633408352088, "grad_norm": 0.951356053352356, "learning_rate": 1.9061913696060038e-05, "loss": 0.8462, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1525, "tokens_per_second_per_gpu": 18612.56, "total_tokens": 150631675 }, { "epoch": 0.09539884971242811, "grad_norm": 1.063523530960083, "learning_rate": 1.9074421513445903e-05, "loss": 0.9176, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1526, "tokens_per_second_per_gpu": 17353.81, "total_tokens": 150730119 }, { "epoch": 0.09546136534133533, "grad_norm": 0.9675443768501282, "learning_rate": 1.908692933083177e-05, "loss": 0.7834, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1527, "tokens_per_second_per_gpu": 17028.93, "total_tokens": 150829703 }, { "epoch": 0.09552388097024256, "grad_norm": 0.9965690970420837, "learning_rate": 1.9099437148217637e-05, "loss": 0.8237, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1528, "tokens_per_second_per_gpu": 17700.48, "total_tokens": 150928553 }, { "epoch": 0.09558639659914979, "grad_norm": 0.9728791117668152, "learning_rate": 1.9111944965603505e-05, "loss": 0.8388, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1529, "tokens_per_second_per_gpu": 16817.38, "total_tokens": 151027775 }, { "epoch": 0.09564891222805702, "grad_norm": 1.0146440267562866, "learning_rate": 1.912445278298937e-05, "loss": 0.8326, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1530, "tokens_per_second_per_gpu": 17529.01, "total_tokens": 151128370 }, { "epoch": 0.09571142785696424, "grad_norm": 0.9306171536445618, "learning_rate": 1.9136960600375236e-05, "loss": 0.806, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1531, "tokens_per_second_per_gpu": 16710.13, "total_tokens": 151226666 }, { "epoch": 0.09577394348587147, "grad_norm": 1.0668222904205322, "learning_rate": 1.91494684177611e-05, "loss": 0.8234, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1532, "tokens_per_second_per_gpu": 16969.95, "total_tokens": 151317701 }, { "epoch": 0.0958364591147787, "grad_norm": 0.9880815148353577, "learning_rate": 1.916197623514697e-05, "loss": 0.8893, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1533, "tokens_per_second_per_gpu": 18749.04, "total_tokens": 151419724 }, { "epoch": 0.09589897474368592, "grad_norm": 1.0188829898834229, "learning_rate": 1.9174484052532835e-05, "loss": 0.796, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1534, "tokens_per_second_per_gpu": 17147.47, "total_tokens": 151516633 }, { "epoch": 0.09596149037259315, "grad_norm": 0.9412329196929932, "learning_rate": 1.91869918699187e-05, "loss": 0.7821, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1535, "tokens_per_second_per_gpu": 17847.6, "total_tokens": 151617454 }, { "epoch": 0.09602400600150038, "grad_norm": 0.9607561826705933, "learning_rate": 1.9199499687304565e-05, "loss": 0.8165, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1536, "tokens_per_second_per_gpu": 18273.33, "total_tokens": 151721451 }, { "epoch": 0.0960865216304076, "grad_norm": 1.076368808746338, "learning_rate": 1.9212007504690434e-05, "loss": 0.84, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1537, "tokens_per_second_per_gpu": 17469.76, "total_tokens": 151821270 }, { "epoch": 0.09614903725931483, "grad_norm": 0.9863638877868652, "learning_rate": 1.92245153220763e-05, "loss": 0.8425, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1538, "tokens_per_second_per_gpu": 16594.31, "total_tokens": 151918644 }, { "epoch": 0.09621155288822206, "grad_norm": 1.0148142576217651, "learning_rate": 1.9237023139462164e-05, "loss": 0.7553, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1539, "tokens_per_second_per_gpu": 17920.77, "total_tokens": 152017203 }, { "epoch": 0.09627406851712929, "grad_norm": 0.9808884263038635, "learning_rate": 1.9249530956848033e-05, "loss": 0.7707, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1540, "tokens_per_second_per_gpu": 16534.55, "total_tokens": 152112738 }, { "epoch": 0.0963365841460365, "grad_norm": 0.9536232352256775, "learning_rate": 1.9262038774233898e-05, "loss": 0.8197, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1541, "tokens_per_second_per_gpu": 17921.12, "total_tokens": 152214936 }, { "epoch": 0.09639909977494374, "grad_norm": 0.9643787741661072, "learning_rate": 1.9274546591619766e-05, "loss": 0.8179, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1542, "tokens_per_second_per_gpu": 17814.12, "total_tokens": 152315883 }, { "epoch": 0.09646161540385097, "grad_norm": 0.9784713387489319, "learning_rate": 1.928705440900563e-05, "loss": 0.8413, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1543, "tokens_per_second_per_gpu": 18376.27, "total_tokens": 152419009 }, { "epoch": 0.09652413103275818, "grad_norm": 0.978935718536377, "learning_rate": 1.9299562226391497e-05, "loss": 0.7848, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1544, "tokens_per_second_per_gpu": 16971.83, "total_tokens": 152517477 }, { "epoch": 0.09658664666166542, "grad_norm": 1.0246144533157349, "learning_rate": 1.9312070043777362e-05, "loss": 0.8028, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1545, "tokens_per_second_per_gpu": 16657.72, "total_tokens": 152611662 }, { "epoch": 0.09664916229057265, "grad_norm": 0.9359335899353027, "learning_rate": 1.932457786116323e-05, "loss": 0.8262, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1546, "tokens_per_second_per_gpu": 17617.12, "total_tokens": 152712966 }, { "epoch": 0.09671167791947986, "grad_norm": 0.9335824251174927, "learning_rate": 1.9337085678549096e-05, "loss": 0.818, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1547, "tokens_per_second_per_gpu": 18072.92, "total_tokens": 152814863 }, { "epoch": 0.0967741935483871, "grad_norm": 0.9628713130950928, "learning_rate": 1.934959349593496e-05, "loss": 0.7584, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1548, "tokens_per_second_per_gpu": 17071.98, "total_tokens": 152910132 }, { "epoch": 0.09683670917729433, "grad_norm": 0.9251416921615601, "learning_rate": 1.9362101313320826e-05, "loss": 0.7603, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1549, "tokens_per_second_per_gpu": 16749.88, "total_tokens": 153007777 }, { "epoch": 0.09689922480620156, "grad_norm": 0.9913697242736816, "learning_rate": 1.9374609130706695e-05, "loss": 0.8225, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1550, "tokens_per_second_per_gpu": 16807.81, "total_tokens": 153104764 }, { "epoch": 0.09696174043510877, "grad_norm": 0.9167672395706177, "learning_rate": 1.938711694809256e-05, "loss": 0.785, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1551, "tokens_per_second_per_gpu": 16874.42, "total_tokens": 153200386 }, { "epoch": 0.097024256064016, "grad_norm": 0.9345363974571228, "learning_rate": 1.9399624765478425e-05, "loss": 0.8004, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1552, "tokens_per_second_per_gpu": 17214.14, "total_tokens": 153300585 }, { "epoch": 0.09708677169292323, "grad_norm": 0.9775597453117371, "learning_rate": 1.941213258286429e-05, "loss": 0.8112, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1553, "tokens_per_second_per_gpu": 18025.12, "total_tokens": 153401961 }, { "epoch": 0.09714928732183045, "grad_norm": 0.9678319096565247, "learning_rate": 1.942464040025016e-05, "loss": 0.8118, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1554, "tokens_per_second_per_gpu": 16043.19, "total_tokens": 153501429 }, { "epoch": 0.09721180295073768, "grad_norm": 0.9505870938301086, "learning_rate": 1.9437148217636024e-05, "loss": 0.8207, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1555, "tokens_per_second_per_gpu": 17610.23, "total_tokens": 153600048 }, { "epoch": 0.09727431857964491, "grad_norm": 0.957719624042511, "learning_rate": 1.9449656035021893e-05, "loss": 0.785, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1556, "tokens_per_second_per_gpu": 17830.33, "total_tokens": 153695729 }, { "epoch": 0.09733683420855214, "grad_norm": 0.9936085939407349, "learning_rate": 1.9462163852407754e-05, "loss": 0.8205, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1557, "tokens_per_second_per_gpu": 17226.33, "total_tokens": 153791061 }, { "epoch": 0.09739934983745936, "grad_norm": 0.9984171390533447, "learning_rate": 1.9474671669793623e-05, "loss": 0.8707, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1558, "tokens_per_second_per_gpu": 17387.95, "total_tokens": 153892776 }, { "epoch": 0.09746186546636659, "grad_norm": 0.967369019985199, "learning_rate": 1.9487179487179488e-05, "loss": 0.7997, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1559, "tokens_per_second_per_gpu": 17094.59, "total_tokens": 153992096 }, { "epoch": 0.09752438109527382, "grad_norm": 0.9165000319480896, "learning_rate": 1.9499687304565357e-05, "loss": 0.8109, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1560, "tokens_per_second_per_gpu": 17340.36, "total_tokens": 154093306 }, { "epoch": 0.09758689672418104, "grad_norm": 1.0342023372650146, "learning_rate": 1.9512195121951222e-05, "loss": 0.838, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1561, "tokens_per_second_per_gpu": 15833.88, "total_tokens": 154190781 }, { "epoch": 0.09764941235308827, "grad_norm": 1.064800500869751, "learning_rate": 1.9524702939337087e-05, "loss": 0.8089, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1562, "tokens_per_second_per_gpu": 17608.13, "total_tokens": 154290313 }, { "epoch": 0.0977119279819955, "grad_norm": 0.9392610788345337, "learning_rate": 1.9537210756722952e-05, "loss": 0.7904, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1563, "tokens_per_second_per_gpu": 16121.53, "total_tokens": 154388482 }, { "epoch": 0.09777444361090272, "grad_norm": 0.9338555932044983, "learning_rate": 1.954971857410882e-05, "loss": 0.8077, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1564, "tokens_per_second_per_gpu": 18290.91, "total_tokens": 154489677 }, { "epoch": 0.09783695923980995, "grad_norm": 1.0145496129989624, "learning_rate": 1.9562226391494686e-05, "loss": 0.8083, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1565, "tokens_per_second_per_gpu": 17257.04, "total_tokens": 154588489 }, { "epoch": 0.09789947486871718, "grad_norm": 1.0275373458862305, "learning_rate": 1.957473420888055e-05, "loss": 0.826, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1566, "tokens_per_second_per_gpu": 17334.77, "total_tokens": 154688549 }, { "epoch": 0.09796199049762441, "grad_norm": 0.9647895097732544, "learning_rate": 1.9587242026266416e-05, "loss": 0.7799, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1567, "tokens_per_second_per_gpu": 17019.16, "total_tokens": 154785471 }, { "epoch": 0.09802450612653163, "grad_norm": 1.0177282094955444, "learning_rate": 1.9599749843652285e-05, "loss": 0.7978, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1568, "tokens_per_second_per_gpu": 17453.15, "total_tokens": 154884883 }, { "epoch": 0.09808702175543886, "grad_norm": 0.9489284753799438, "learning_rate": 1.961225766103815e-05, "loss": 0.8059, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1569, "tokens_per_second_per_gpu": 17186.72, "total_tokens": 154982362 }, { "epoch": 0.09814953738434609, "grad_norm": 0.9510825276374817, "learning_rate": 1.962476547842402e-05, "loss": 0.7993, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1570, "tokens_per_second_per_gpu": 17769.51, "total_tokens": 155084281 }, { "epoch": 0.09821205301325331, "grad_norm": 1.0032541751861572, "learning_rate": 1.963727329580988e-05, "loss": 0.7592, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1571, "tokens_per_second_per_gpu": 18142.48, "total_tokens": 155178976 }, { "epoch": 0.09827456864216054, "grad_norm": 0.9629340767860413, "learning_rate": 1.964978111319575e-05, "loss": 0.8187, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1572, "tokens_per_second_per_gpu": 17605.42, "total_tokens": 155280385 }, { "epoch": 0.09833708427106777, "grad_norm": 0.9515058994293213, "learning_rate": 1.9662288930581614e-05, "loss": 0.837, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1573, "tokens_per_second_per_gpu": 16770.78, "total_tokens": 155382349 }, { "epoch": 0.09839959989997499, "grad_norm": 0.9901724457740784, "learning_rate": 1.9674796747967483e-05, "loss": 0.7853, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1574, "tokens_per_second_per_gpu": 17860.15, "total_tokens": 155480959 }, { "epoch": 0.09846211552888222, "grad_norm": 0.9873204827308655, "learning_rate": 1.9687304565353348e-05, "loss": 0.8576, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1575, "tokens_per_second_per_gpu": 17933.52, "total_tokens": 155581989 }, { "epoch": 0.09852463115778945, "grad_norm": 0.9236389994621277, "learning_rate": 1.9699812382739213e-05, "loss": 0.7551, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1576, "tokens_per_second_per_gpu": 16499.71, "total_tokens": 155678870 }, { "epoch": 0.09858714678669668, "grad_norm": 0.9659404158592224, "learning_rate": 1.971232020012508e-05, "loss": 0.786, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1577, "tokens_per_second_per_gpu": 17472.08, "total_tokens": 155774883 }, { "epoch": 0.0986496624156039, "grad_norm": 0.9786807894706726, "learning_rate": 1.9724828017510947e-05, "loss": 0.8291, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1578, "tokens_per_second_per_gpu": 17996.54, "total_tokens": 155877002 }, { "epoch": 0.09871217804451113, "grad_norm": 0.9672409892082214, "learning_rate": 1.9737335834896812e-05, "loss": 0.8086, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1579, "tokens_per_second_per_gpu": 17559.88, "total_tokens": 155973456 }, { "epoch": 0.09877469367341836, "grad_norm": 0.9562617540359497, "learning_rate": 1.9749843652282677e-05, "loss": 0.8146, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1580, "tokens_per_second_per_gpu": 17413.2, "total_tokens": 156074510 }, { "epoch": 0.09883720930232558, "grad_norm": 0.9707178473472595, "learning_rate": 1.9762351469668543e-05, "loss": 0.8086, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1581, "tokens_per_second_per_gpu": 17503.5, "total_tokens": 156174428 }, { "epoch": 0.09889972493123281, "grad_norm": 1.0210930109024048, "learning_rate": 1.977485928705441e-05, "loss": 0.7906, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1582, "tokens_per_second_per_gpu": 16197.02, "total_tokens": 156267548 }, { "epoch": 0.09896224056014004, "grad_norm": 0.9210599660873413, "learning_rate": 1.9787367104440276e-05, "loss": 0.8165, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1583, "tokens_per_second_per_gpu": 18806.54, "total_tokens": 156372026 }, { "epoch": 0.09902475618904726, "grad_norm": 0.9844116568565369, "learning_rate": 1.979987492182614e-05, "loss": 0.8443, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1584, "tokens_per_second_per_gpu": 18477.19, "total_tokens": 156474642 }, { "epoch": 0.09908727181795449, "grad_norm": 0.9690150022506714, "learning_rate": 1.9812382739212007e-05, "loss": 0.8213, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1585, "tokens_per_second_per_gpu": 17178.33, "total_tokens": 156573294 }, { "epoch": 0.09914978744686172, "grad_norm": 0.9692025780677795, "learning_rate": 1.9824890556597875e-05, "loss": 0.8145, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1586, "tokens_per_second_per_gpu": 17952.28, "total_tokens": 156675648 }, { "epoch": 0.09921230307576895, "grad_norm": 0.9622601866722107, "learning_rate": 1.983739837398374e-05, "loss": 0.8154, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1587, "tokens_per_second_per_gpu": 17900.48, "total_tokens": 156778426 }, { "epoch": 0.09927481870467617, "grad_norm": 0.9654762148857117, "learning_rate": 1.984990619136961e-05, "loss": 0.7749, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1588, "tokens_per_second_per_gpu": 16246.65, "total_tokens": 156874410 }, { "epoch": 0.0993373343335834, "grad_norm": 0.9483827948570251, "learning_rate": 1.9862414008755474e-05, "loss": 0.8084, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1589, "tokens_per_second_per_gpu": 17383.13, "total_tokens": 156976239 }, { "epoch": 0.09939984996249063, "grad_norm": 1.0008115768432617, "learning_rate": 1.987492182614134e-05, "loss": 0.8212, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1590, "tokens_per_second_per_gpu": 17864.38, "total_tokens": 157078100 }, { "epoch": 0.09946236559139784, "grad_norm": 1.0576136112213135, "learning_rate": 1.9887429643527205e-05, "loss": 0.7809, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1591, "tokens_per_second_per_gpu": 16170.64, "total_tokens": 157170474 }, { "epoch": 0.09952488122030508, "grad_norm": 0.9821868538856506, "learning_rate": 1.9899937460913073e-05, "loss": 0.7736, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1592, "tokens_per_second_per_gpu": 17032.38, "total_tokens": 157265751 }, { "epoch": 0.0995873968492123, "grad_norm": 1.0260088443756104, "learning_rate": 1.991244527829894e-05, "loss": 0.8264, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1593, "tokens_per_second_per_gpu": 16350.54, "total_tokens": 157361803 }, { "epoch": 0.09964991247811952, "grad_norm": 0.9807103276252747, "learning_rate": 1.9924953095684804e-05, "loss": 0.79, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1594, "tokens_per_second_per_gpu": 17054.46, "total_tokens": 157459299 }, { "epoch": 0.09971242810702675, "grad_norm": 0.9581140279769897, "learning_rate": 1.993746091307067e-05, "loss": 0.7665, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1595, "tokens_per_second_per_gpu": 17064.32, "total_tokens": 157557988 }, { "epoch": 0.09977494373593399, "grad_norm": 1.0685861110687256, "learning_rate": 1.9949968730456537e-05, "loss": 0.8469, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1596, "tokens_per_second_per_gpu": 17112.28, "total_tokens": 157657036 }, { "epoch": 0.09983745936484122, "grad_norm": 0.940155029296875, "learning_rate": 1.9962476547842403e-05, "loss": 0.7851, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1597, "tokens_per_second_per_gpu": 17826.32, "total_tokens": 157759087 }, { "epoch": 0.09989997499374843, "grad_norm": 1.0276092290878296, "learning_rate": 1.9974984365228268e-05, "loss": 0.8063, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1598, "tokens_per_second_per_gpu": 17995.74, "total_tokens": 157854002 }, { "epoch": 0.09996249062265566, "grad_norm": 0.9439956545829773, "learning_rate": 1.9987492182614133e-05, "loss": 0.7981, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1599, "tokens_per_second_per_gpu": 17344.84, "total_tokens": 157954919 }, { "epoch": 0.1000250062515629, "grad_norm": 0.9645910263061523, "learning_rate": 2e-05, "loss": 0.793, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1600, "tokens_per_second_per_gpu": 17554.65, "total_tokens": 158054187 }, { "epoch": 0.10008752188047011, "grad_norm": 1.1715469360351562, "learning_rate": 2e-05, "loss": 0.8492, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1601, "tokens_per_second_per_gpu": 17871.09, "total_tokens": 158155596 }, { "epoch": 0.10015003750937734, "grad_norm": 0.999699056148529, "learning_rate": 2e-05, "loss": 0.8076, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1602, "tokens_per_second_per_gpu": 18261.16, "total_tokens": 158255234 }, { "epoch": 0.10021255313828457, "grad_norm": 0.9730318188667297, "learning_rate": 2e-05, "loss": 0.8094, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1603, "tokens_per_second_per_gpu": 17519.46, "total_tokens": 158354306 }, { "epoch": 0.10027506876719179, "grad_norm": 0.9607117176055908, "learning_rate": 2e-05, "loss": 0.7864, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1604, "tokens_per_second_per_gpu": 17627.18, "total_tokens": 158454544 }, { "epoch": 0.10033758439609902, "grad_norm": 0.99571293592453, "learning_rate": 2e-05, "loss": 0.7795, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1605, "tokens_per_second_per_gpu": 16814.38, "total_tokens": 158552723 }, { "epoch": 0.10040010002500625, "grad_norm": 0.9764226675033569, "learning_rate": 2e-05, "loss": 0.7677, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1606, "tokens_per_second_per_gpu": 17068.06, "total_tokens": 158648089 }, { "epoch": 0.10046261565391348, "grad_norm": 0.9550985097885132, "learning_rate": 2e-05, "loss": 0.7903, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1607, "tokens_per_second_per_gpu": 17113.84, "total_tokens": 158746223 }, { "epoch": 0.1005251312828207, "grad_norm": 0.9761894345283508, "learning_rate": 2e-05, "loss": 0.8409, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1608, "tokens_per_second_per_gpu": 17851.26, "total_tokens": 158847780 }, { "epoch": 0.10058764691172793, "grad_norm": 0.9159751534461975, "learning_rate": 2e-05, "loss": 0.795, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1609, "tokens_per_second_per_gpu": 18389.12, "total_tokens": 158952544 }, { "epoch": 0.10065016254063516, "grad_norm": 1.02918541431427, "learning_rate": 2e-05, "loss": 0.8329, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1610, "tokens_per_second_per_gpu": 17477.46, "total_tokens": 159053929 }, { "epoch": 0.10071267816954238, "grad_norm": 0.9791957139968872, "learning_rate": 2e-05, "loss": 0.7649, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1611, "tokens_per_second_per_gpu": 17769.21, "total_tokens": 159151662 }, { "epoch": 0.10077519379844961, "grad_norm": 0.9296393990516663, "learning_rate": 2e-05, "loss": 0.7596, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1612, "tokens_per_second_per_gpu": 17685.51, "total_tokens": 159252342 }, { "epoch": 0.10083770942735684, "grad_norm": 0.9672159552574158, "learning_rate": 2e-05, "loss": 0.8298, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1613, "tokens_per_second_per_gpu": 17584.0, "total_tokens": 159354320 }, { "epoch": 0.10090022505626407, "grad_norm": 0.9856476187705994, "learning_rate": 2e-05, "loss": 0.7862, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1614, "tokens_per_second_per_gpu": 17317.16, "total_tokens": 159451239 }, { "epoch": 0.10096274068517129, "grad_norm": 0.979172945022583, "learning_rate": 2e-05, "loss": 0.8073, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1615, "tokens_per_second_per_gpu": 17626.59, "total_tokens": 159550152 }, { "epoch": 0.10102525631407852, "grad_norm": 0.9746542572975159, "learning_rate": 2e-05, "loss": 0.8092, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1616, "tokens_per_second_per_gpu": 18522.0, "total_tokens": 159654682 }, { "epoch": 0.10108777194298575, "grad_norm": 0.9663761854171753, "learning_rate": 2e-05, "loss": 0.7543, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1617, "tokens_per_second_per_gpu": 17395.24, "total_tokens": 159751230 }, { "epoch": 0.10115028757189297, "grad_norm": 0.9499857425689697, "learning_rate": 2e-05, "loss": 0.7884, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1618, "tokens_per_second_per_gpu": 17879.94, "total_tokens": 159852041 }, { "epoch": 0.1012128032008002, "grad_norm": 0.9690098762512207, "learning_rate": 2e-05, "loss": 0.8071, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1619, "tokens_per_second_per_gpu": 15788.5, "total_tokens": 159949155 }, { "epoch": 0.10127531882970743, "grad_norm": 1.0147632360458374, "learning_rate": 2e-05, "loss": 0.8116, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1620, "tokens_per_second_per_gpu": 17636.97, "total_tokens": 160049369 }, { "epoch": 0.10133783445861465, "grad_norm": 1.0156848430633545, "learning_rate": 2e-05, "loss": 0.8294, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1621, "tokens_per_second_per_gpu": 17508.66, "total_tokens": 160147073 }, { "epoch": 0.10140035008752188, "grad_norm": 0.9439324140548706, "learning_rate": 2e-05, "loss": 0.8392, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1622, "tokens_per_second_per_gpu": 17479.92, "total_tokens": 160248676 }, { "epoch": 0.10146286571642911, "grad_norm": 0.9800053834915161, "learning_rate": 2e-05, "loss": 0.8514, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1623, "tokens_per_second_per_gpu": 18465.21, "total_tokens": 160354547 }, { "epoch": 0.10152538134533634, "grad_norm": 0.949607789516449, "learning_rate": 2e-05, "loss": 0.7763, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1624, "tokens_per_second_per_gpu": 17338.84, "total_tokens": 160451787 }, { "epoch": 0.10158789697424356, "grad_norm": 0.9767741560935974, "learning_rate": 2e-05, "loss": 0.852, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1625, "tokens_per_second_per_gpu": 18245.02, "total_tokens": 160556564 }, { "epoch": 0.10165041260315079, "grad_norm": 1.0426582098007202, "learning_rate": 2e-05, "loss": 0.7989, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1626, "tokens_per_second_per_gpu": 17132.34, "total_tokens": 160654398 }, { "epoch": 0.10171292823205802, "grad_norm": 0.9520421028137207, "learning_rate": 2e-05, "loss": 0.8079, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1627, "tokens_per_second_per_gpu": 16935.4, "total_tokens": 160752610 }, { "epoch": 0.10177544386096524, "grad_norm": 0.987126886844635, "learning_rate": 2e-05, "loss": 0.8611, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1628, "tokens_per_second_per_gpu": 17335.21, "total_tokens": 160851070 }, { "epoch": 0.10183795948987247, "grad_norm": 0.9487716555595398, "learning_rate": 2e-05, "loss": 0.8098, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1629, "tokens_per_second_per_gpu": 17358.32, "total_tokens": 160950678 }, { "epoch": 0.1019004751187797, "grad_norm": 1.0010491609573364, "learning_rate": 2e-05, "loss": 0.8397, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1630, "tokens_per_second_per_gpu": 18391.38, "total_tokens": 161052842 }, { "epoch": 0.10196299074768692, "grad_norm": 0.9573408961296082, "learning_rate": 2e-05, "loss": 0.8198, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1631, "tokens_per_second_per_gpu": 17034.03, "total_tokens": 161148374 }, { "epoch": 0.10202550637659415, "grad_norm": 0.9803699254989624, "learning_rate": 2e-05, "loss": 0.8424, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1632, "tokens_per_second_per_gpu": 16801.24, "total_tokens": 161246811 }, { "epoch": 0.10208802200550138, "grad_norm": 0.9126204252243042, "learning_rate": 2e-05, "loss": 0.7802, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1633, "tokens_per_second_per_gpu": 17464.16, "total_tokens": 161345113 }, { "epoch": 0.10215053763440861, "grad_norm": 0.9530923366546631, "learning_rate": 2e-05, "loss": 0.7992, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1634, "tokens_per_second_per_gpu": 16962.93, "total_tokens": 161444596 }, { "epoch": 0.10221305326331583, "grad_norm": 1.0153344869613647, "learning_rate": 2e-05, "loss": 0.7959, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1635, "tokens_per_second_per_gpu": 16736.31, "total_tokens": 161540609 }, { "epoch": 0.10227556889222306, "grad_norm": 0.9585055708885193, "learning_rate": 2e-05, "loss": 0.8102, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1636, "tokens_per_second_per_gpu": 16816.76, "total_tokens": 161637791 }, { "epoch": 0.10233808452113029, "grad_norm": 1.002997636795044, "learning_rate": 2e-05, "loss": 0.8029, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1637, "tokens_per_second_per_gpu": 17069.72, "total_tokens": 161730560 }, { "epoch": 0.1024006001500375, "grad_norm": 0.9902772307395935, "learning_rate": 2e-05, "loss": 0.8198, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1638, "tokens_per_second_per_gpu": 17068.79, "total_tokens": 161828970 }, { "epoch": 0.10246311577894474, "grad_norm": 1.0031322240829468, "learning_rate": 2e-05, "loss": 0.7705, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1639, "tokens_per_second_per_gpu": 18473.96, "total_tokens": 161929569 }, { "epoch": 0.10252563140785197, "grad_norm": 0.9604263305664062, "learning_rate": 2e-05, "loss": 0.7959, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1640, "tokens_per_second_per_gpu": 16386.41, "total_tokens": 162026395 }, { "epoch": 0.10258814703675918, "grad_norm": 0.9633968472480774, "learning_rate": 2e-05, "loss": 0.7949, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1641, "tokens_per_second_per_gpu": 17514.04, "total_tokens": 162121344 }, { "epoch": 0.10265066266566641, "grad_norm": 0.9507586359977722, "learning_rate": 2e-05, "loss": 0.8446, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1642, "tokens_per_second_per_gpu": 17833.12, "total_tokens": 162222996 }, { "epoch": 0.10271317829457365, "grad_norm": 0.9933309555053711, "learning_rate": 2e-05, "loss": 0.8588, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1643, "tokens_per_second_per_gpu": 17605.74, "total_tokens": 162323778 }, { "epoch": 0.10277569392348088, "grad_norm": 0.9365588426589966, "learning_rate": 2e-05, "loss": 0.7807, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1644, "tokens_per_second_per_gpu": 17331.63, "total_tokens": 162425215 }, { "epoch": 0.1028382095523881, "grad_norm": 0.9357625246047974, "learning_rate": 2e-05, "loss": 0.7326, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1645, "tokens_per_second_per_gpu": 16914.91, "total_tokens": 162520875 }, { "epoch": 0.10290072518129532, "grad_norm": 0.9432289004325867, "learning_rate": 2e-05, "loss": 0.8304, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1646, "tokens_per_second_per_gpu": 17306.18, "total_tokens": 162619504 }, { "epoch": 0.10296324081020256, "grad_norm": 0.9399210214614868, "learning_rate": 2e-05, "loss": 0.8007, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1647, "tokens_per_second_per_gpu": 15949.34, "total_tokens": 162716938 }, { "epoch": 0.10302575643910977, "grad_norm": 0.967381477355957, "learning_rate": 2e-05, "loss": 0.8256, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1648, "tokens_per_second_per_gpu": 17463.04, "total_tokens": 162817232 }, { "epoch": 0.103088272068017, "grad_norm": 0.9652966856956482, "learning_rate": 2e-05, "loss": 0.78, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1649, "tokens_per_second_per_gpu": 16926.65, "total_tokens": 162913514 }, { "epoch": 0.10315078769692423, "grad_norm": 0.9174568057060242, "learning_rate": 2e-05, "loss": 0.7872, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1650, "tokens_per_second_per_gpu": 17748.02, "total_tokens": 163015269 }, { "epoch": 0.10321330332583145, "grad_norm": 0.9606916308403015, "learning_rate": 2e-05, "loss": 0.7642, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1651, "tokens_per_second_per_gpu": 17819.33, "total_tokens": 163114806 }, { "epoch": 0.10327581895473868, "grad_norm": 0.9470860362052917, "learning_rate": 2e-05, "loss": 0.8185, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1652, "tokens_per_second_per_gpu": 18628.13, "total_tokens": 163217498 }, { "epoch": 0.10333833458364591, "grad_norm": 0.9499413371086121, "learning_rate": 2e-05, "loss": 0.8157, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1653, "tokens_per_second_per_gpu": 17471.14, "total_tokens": 163318547 }, { "epoch": 0.10340085021255314, "grad_norm": 0.9719677567481995, "learning_rate": 2e-05, "loss": 0.8169, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1654, "tokens_per_second_per_gpu": 17489.43, "total_tokens": 163419006 }, { "epoch": 0.10346336584146036, "grad_norm": 0.9641180634498596, "learning_rate": 2e-05, "loss": 0.7853, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1655, "tokens_per_second_per_gpu": 17073.15, "total_tokens": 163518703 }, { "epoch": 0.10352588147036759, "grad_norm": 0.9908487200737, "learning_rate": 2e-05, "loss": 0.7895, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1656, "tokens_per_second_per_gpu": 17175.51, "total_tokens": 163612639 }, { "epoch": 0.10358839709927482, "grad_norm": 1.0028159618377686, "learning_rate": 2e-05, "loss": 0.807, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1657, "tokens_per_second_per_gpu": 17847.5, "total_tokens": 163712298 }, { "epoch": 0.10365091272818204, "grad_norm": 0.9559161067008972, "learning_rate": 2e-05, "loss": 0.7843, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1658, "tokens_per_second_per_gpu": 17717.16, "total_tokens": 163809931 }, { "epoch": 0.10371342835708927, "grad_norm": 0.9326722621917725, "learning_rate": 2e-05, "loss": 0.794, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1659, "tokens_per_second_per_gpu": 17593.91, "total_tokens": 163914712 }, { "epoch": 0.1037759439859965, "grad_norm": 0.9101183414459229, "learning_rate": 2e-05, "loss": 0.7885, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1660, "tokens_per_second_per_gpu": 18221.3, "total_tokens": 164018785 }, { "epoch": 0.10383845961490372, "grad_norm": 0.9809156060218811, "learning_rate": 2e-05, "loss": 0.7974, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1661, "tokens_per_second_per_gpu": 17464.12, "total_tokens": 164118962 }, { "epoch": 0.10390097524381095, "grad_norm": 0.9639121890068054, "learning_rate": 2e-05, "loss": 0.764, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1662, "tokens_per_second_per_gpu": 17199.58, "total_tokens": 164216316 }, { "epoch": 0.10396349087271818, "grad_norm": 1.0243470668792725, "learning_rate": 2e-05, "loss": 0.7702, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1663, "tokens_per_second_per_gpu": 16599.91, "total_tokens": 164312153 }, { "epoch": 0.10402600650162541, "grad_norm": 1.0384470224380493, "learning_rate": 2e-05, "loss": 0.7829, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1664, "tokens_per_second_per_gpu": 16973.87, "total_tokens": 164408814 }, { "epoch": 0.10408852213053263, "grad_norm": 0.9581444263458252, "learning_rate": 2e-05, "loss": 0.795, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1665, "tokens_per_second_per_gpu": 17184.61, "total_tokens": 164505115 }, { "epoch": 0.10415103775943986, "grad_norm": 0.992615818977356, "learning_rate": 2e-05, "loss": 0.7667, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1666, "tokens_per_second_per_gpu": 17862.25, "total_tokens": 164603045 }, { "epoch": 0.10421355338834709, "grad_norm": 0.9441550374031067, "learning_rate": 2e-05, "loss": 0.7689, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1667, "tokens_per_second_per_gpu": 16314.39, "total_tokens": 164701075 }, { "epoch": 0.10427606901725431, "grad_norm": 0.9978392720222473, "learning_rate": 2e-05, "loss": 0.8786, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1668, "tokens_per_second_per_gpu": 18068.76, "total_tokens": 164800131 }, { "epoch": 0.10433858464616154, "grad_norm": 0.9804447889328003, "learning_rate": 2e-05, "loss": 0.7735, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1669, "tokens_per_second_per_gpu": 16295.87, "total_tokens": 164898757 }, { "epoch": 0.10440110027506877, "grad_norm": 1.0547150373458862, "learning_rate": 2e-05, "loss": 0.786, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1670, "tokens_per_second_per_gpu": 15353.96, "total_tokens": 164992503 }, { "epoch": 0.10446361590397599, "grad_norm": 0.9481258988380432, "learning_rate": 2e-05, "loss": 0.8234, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1671, "tokens_per_second_per_gpu": 18496.58, "total_tokens": 165098288 }, { "epoch": 0.10452613153288322, "grad_norm": 0.9730398654937744, "learning_rate": 2e-05, "loss": 0.7603, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1672, "tokens_per_second_per_gpu": 17983.7, "total_tokens": 165195404 }, { "epoch": 0.10458864716179045, "grad_norm": 0.9822936654090881, "learning_rate": 2e-05, "loss": 0.8178, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1673, "tokens_per_second_per_gpu": 17241.47, "total_tokens": 165293310 }, { "epoch": 0.10465116279069768, "grad_norm": 0.940584123134613, "learning_rate": 2e-05, "loss": 0.8597, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1674, "tokens_per_second_per_gpu": 18091.45, "total_tokens": 165396734 }, { "epoch": 0.1047136784196049, "grad_norm": 0.9906832575798035, "learning_rate": 2e-05, "loss": 0.8021, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1675, "tokens_per_second_per_gpu": 17192.4, "total_tokens": 165496952 }, { "epoch": 0.10477619404851213, "grad_norm": 0.931049644947052, "learning_rate": 2e-05, "loss": 0.771, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1676, "tokens_per_second_per_gpu": 17311.53, "total_tokens": 165595829 }, { "epoch": 0.10483870967741936, "grad_norm": 0.949821949005127, "learning_rate": 2e-05, "loss": 0.8308, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1677, "tokens_per_second_per_gpu": 19269.92, "total_tokens": 165698961 }, { "epoch": 0.10490122530632658, "grad_norm": 0.9762237071990967, "learning_rate": 2e-05, "loss": 0.8068, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1678, "tokens_per_second_per_gpu": 17382.33, "total_tokens": 165797787 }, { "epoch": 0.1049637409352338, "grad_norm": 1.0150736570358276, "learning_rate": 2e-05, "loss": 0.8738, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1679, "tokens_per_second_per_gpu": 17296.84, "total_tokens": 165897981 }, { "epoch": 0.10502625656414104, "grad_norm": 1.003782033920288, "learning_rate": 2e-05, "loss": 0.7826, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1680, "tokens_per_second_per_gpu": 17018.68, "total_tokens": 165994170 }, { "epoch": 0.10508877219304827, "grad_norm": 0.9264342784881592, "learning_rate": 2e-05, "loss": 0.769, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1681, "tokens_per_second_per_gpu": 17188.97, "total_tokens": 166094335 }, { "epoch": 0.10515128782195549, "grad_norm": 0.9252753853797913, "learning_rate": 2e-05, "loss": 0.7715, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1682, "tokens_per_second_per_gpu": 17936.53, "total_tokens": 166192518 }, { "epoch": 0.10521380345086272, "grad_norm": 0.9590672254562378, "learning_rate": 2e-05, "loss": 0.76, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1683, "tokens_per_second_per_gpu": 17372.58, "total_tokens": 166294077 }, { "epoch": 0.10527631907976995, "grad_norm": 0.9549646377563477, "learning_rate": 2e-05, "loss": 0.8222, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1684, "tokens_per_second_per_gpu": 18655.01, "total_tokens": 166396197 }, { "epoch": 0.10533883470867716, "grad_norm": 0.9739780426025391, "learning_rate": 2e-05, "loss": 0.829, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1685, "tokens_per_second_per_gpu": 17955.97, "total_tokens": 166497088 }, { "epoch": 0.1054013503375844, "grad_norm": 1.0114141702651978, "learning_rate": 2e-05, "loss": 0.7513, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1686, "tokens_per_second_per_gpu": 16867.51, "total_tokens": 166592438 }, { "epoch": 0.10546386596649163, "grad_norm": 0.9263441562652588, "learning_rate": 2e-05, "loss": 0.7606, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1687, "tokens_per_second_per_gpu": 18334.77, "total_tokens": 166688264 }, { "epoch": 0.10552638159539884, "grad_norm": 0.9871243238449097, "learning_rate": 2e-05, "loss": 0.7845, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1688, "tokens_per_second_per_gpu": 16557.12, "total_tokens": 166784535 }, { "epoch": 0.10558889722430607, "grad_norm": 0.9777353405952454, "learning_rate": 2e-05, "loss": 0.7989, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1689, "tokens_per_second_per_gpu": 16828.03, "total_tokens": 166878387 }, { "epoch": 0.1056514128532133, "grad_norm": 0.9594184160232544, "learning_rate": 2e-05, "loss": 0.7725, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1690, "tokens_per_second_per_gpu": 17834.48, "total_tokens": 166979918 }, { "epoch": 0.10571392848212054, "grad_norm": 0.974335789680481, "learning_rate": 2e-05, "loss": 0.8046, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1691, "tokens_per_second_per_gpu": 17744.0, "total_tokens": 167080216 }, { "epoch": 0.10577644411102775, "grad_norm": 0.9437083601951599, "learning_rate": 2e-05, "loss": 0.7861, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1692, "tokens_per_second_per_gpu": 17659.56, "total_tokens": 167181188 }, { "epoch": 0.10583895973993498, "grad_norm": 0.9374207854270935, "learning_rate": 2e-05, "loss": 0.7817, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1693, "tokens_per_second_per_gpu": 17111.54, "total_tokens": 167278968 }, { "epoch": 0.10590147536884222, "grad_norm": 1.0227004289627075, "learning_rate": 2e-05, "loss": 0.8227, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1694, "tokens_per_second_per_gpu": 18056.34, "total_tokens": 167380146 }, { "epoch": 0.10596399099774943, "grad_norm": 0.9965002536773682, "learning_rate": 2e-05, "loss": 0.8079, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1695, "tokens_per_second_per_gpu": 17684.87, "total_tokens": 167479143 }, { "epoch": 0.10602650662665666, "grad_norm": 0.9892390966415405, "learning_rate": 2e-05, "loss": 0.7865, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1696, "tokens_per_second_per_gpu": 16767.21, "total_tokens": 167574021 }, { "epoch": 0.1060890222555639, "grad_norm": 0.9961382746696472, "learning_rate": 2e-05, "loss": 0.8276, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1697, "tokens_per_second_per_gpu": 18323.48, "total_tokens": 167675731 }, { "epoch": 0.10615153788447111, "grad_norm": 0.9971263408660889, "learning_rate": 2e-05, "loss": 0.7979, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1698, "tokens_per_second_per_gpu": 18056.48, "total_tokens": 167773725 }, { "epoch": 0.10621405351337834, "grad_norm": 1.0382544994354248, "learning_rate": 2e-05, "loss": 0.7468, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1699, "tokens_per_second_per_gpu": 18147.03, "total_tokens": 167873688 }, { "epoch": 0.10627656914228557, "grad_norm": 0.936788022518158, "learning_rate": 2e-05, "loss": 0.7814, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1700, "tokens_per_second_per_gpu": 17866.68, "total_tokens": 167974244 }, { "epoch": 0.1063390847711928, "grad_norm": 0.958412230014801, "learning_rate": 2e-05, "loss": 0.8224, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1701, "tokens_per_second_per_gpu": 17267.42, "total_tokens": 168074069 }, { "epoch": 0.10640160040010002, "grad_norm": 1.0028541088104248, "learning_rate": 2e-05, "loss": 0.8161, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1702, "tokens_per_second_per_gpu": 17477.75, "total_tokens": 168175157 }, { "epoch": 0.10646411602900725, "grad_norm": 1.0321539640426636, "learning_rate": 2e-05, "loss": 0.8231, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1703, "tokens_per_second_per_gpu": 17142.25, "total_tokens": 168275259 }, { "epoch": 0.10652663165791448, "grad_norm": 1.0274068117141724, "learning_rate": 2e-05, "loss": 0.7945, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1704, "tokens_per_second_per_gpu": 16200.07, "total_tokens": 168369911 }, { "epoch": 0.1065891472868217, "grad_norm": 0.9841079711914062, "learning_rate": 2e-05, "loss": 0.8739, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1705, "tokens_per_second_per_gpu": 18194.18, "total_tokens": 168472527 }, { "epoch": 0.10665166291572893, "grad_norm": 1.046546220779419, "learning_rate": 2e-05, "loss": 0.8657, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1706, "tokens_per_second_per_gpu": 16813.08, "total_tokens": 168571848 }, { "epoch": 0.10671417854463616, "grad_norm": 1.001078724861145, "learning_rate": 2e-05, "loss": 0.8065, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1707, "tokens_per_second_per_gpu": 17177.95, "total_tokens": 168669363 }, { "epoch": 0.10677669417354338, "grad_norm": 1.009695291519165, "learning_rate": 2e-05, "loss": 0.8069, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1708, "tokens_per_second_per_gpu": 17171.08, "total_tokens": 168766334 }, { "epoch": 0.10683920980245061, "grad_norm": 0.964839518070221, "learning_rate": 2e-05, "loss": 0.8131, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1709, "tokens_per_second_per_gpu": 17322.53, "total_tokens": 168864004 }, { "epoch": 0.10690172543135784, "grad_norm": 1.0038995742797852, "learning_rate": 2e-05, "loss": 0.7845, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1710, "tokens_per_second_per_gpu": 18342.94, "total_tokens": 168963418 }, { "epoch": 0.10696424106026507, "grad_norm": 0.9863675832748413, "learning_rate": 2e-05, "loss": 0.7513, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1711, "tokens_per_second_per_gpu": 17080.95, "total_tokens": 169059512 }, { "epoch": 0.10702675668917229, "grad_norm": 1.0226118564605713, "learning_rate": 2e-05, "loss": 0.7817, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1712, "tokens_per_second_per_gpu": 17450.52, "total_tokens": 169155646 }, { "epoch": 0.10708927231807952, "grad_norm": 0.9966205954551697, "learning_rate": 2e-05, "loss": 0.8268, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1713, "tokens_per_second_per_gpu": 16864.33, "total_tokens": 169254821 }, { "epoch": 0.10715178794698675, "grad_norm": 0.9537058472633362, "learning_rate": 2e-05, "loss": 0.7868, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1714, "tokens_per_second_per_gpu": 18211.38, "total_tokens": 169356414 }, { "epoch": 0.10721430357589397, "grad_norm": 1.0290732383728027, "learning_rate": 2e-05, "loss": 0.8246, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1715, "tokens_per_second_per_gpu": 17906.27, "total_tokens": 169459049 }, { "epoch": 0.1072768192048012, "grad_norm": 1.002218246459961, "learning_rate": 2e-05, "loss": 0.8302, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1716, "tokens_per_second_per_gpu": 17212.61, "total_tokens": 169555815 }, { "epoch": 0.10733933483370843, "grad_norm": 1.0204476118087769, "learning_rate": 2e-05, "loss": 0.7995, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1717, "tokens_per_second_per_gpu": 16397.12, "total_tokens": 169652648 }, { "epoch": 0.10740185046261565, "grad_norm": 0.9820974469184875, "learning_rate": 2e-05, "loss": 0.8461, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1718, "tokens_per_second_per_gpu": 18789.52, "total_tokens": 169757144 }, { "epoch": 0.10746436609152288, "grad_norm": 1.0055874586105347, "learning_rate": 2e-05, "loss": 0.792, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1719, "tokens_per_second_per_gpu": 17619.15, "total_tokens": 169856245 }, { "epoch": 0.10752688172043011, "grad_norm": 0.9474127292633057, "learning_rate": 2e-05, "loss": 0.7639, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1720, "tokens_per_second_per_gpu": 18221.85, "total_tokens": 169956751 }, { "epoch": 0.10758939734933734, "grad_norm": 0.9953257441520691, "learning_rate": 2e-05, "loss": 0.819, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1721, "tokens_per_second_per_gpu": 16257.19, "total_tokens": 170052294 }, { "epoch": 0.10765191297824456, "grad_norm": 1.0213000774383545, "learning_rate": 2e-05, "loss": 0.7978, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1722, "tokens_per_second_per_gpu": 16954.9, "total_tokens": 170145559 }, { "epoch": 0.10771442860715179, "grad_norm": 0.9521053433418274, "learning_rate": 2e-05, "loss": 0.7671, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1723, "tokens_per_second_per_gpu": 18737.36, "total_tokens": 170247726 }, { "epoch": 0.10777694423605902, "grad_norm": 0.9445487856864929, "learning_rate": 2e-05, "loss": 0.7798, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1724, "tokens_per_second_per_gpu": 17330.8, "total_tokens": 170345469 }, { "epoch": 0.10783945986496624, "grad_norm": 0.9562898278236389, "learning_rate": 2e-05, "loss": 0.7918, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1725, "tokens_per_second_per_gpu": 17547.15, "total_tokens": 170444778 }, { "epoch": 0.10790197549387347, "grad_norm": 0.9913601279258728, "learning_rate": 2e-05, "loss": 0.7748, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1726, "tokens_per_second_per_gpu": 16855.61, "total_tokens": 170541078 }, { "epoch": 0.1079644911227807, "grad_norm": 0.939502477645874, "learning_rate": 2e-05, "loss": 0.7723, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1727, "tokens_per_second_per_gpu": 17283.68, "total_tokens": 170639576 }, { "epoch": 0.10802700675168792, "grad_norm": 0.9351162314414978, "learning_rate": 2e-05, "loss": 0.7813, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1728, "tokens_per_second_per_gpu": 16826.69, "total_tokens": 170738858 }, { "epoch": 0.10808952238059515, "grad_norm": 0.9782776236534119, "learning_rate": 2e-05, "loss": 0.7672, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1729, "tokens_per_second_per_gpu": 17057.47, "total_tokens": 170832532 }, { "epoch": 0.10815203800950238, "grad_norm": 0.9238916039466858, "learning_rate": 2e-05, "loss": 0.7757, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1730, "tokens_per_second_per_gpu": 17946.5, "total_tokens": 170932519 }, { "epoch": 0.10821455363840961, "grad_norm": 0.932828962802887, "learning_rate": 2e-05, "loss": 0.7903, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1731, "tokens_per_second_per_gpu": 18231.88, "total_tokens": 171033717 }, { "epoch": 0.10827706926731683, "grad_norm": 0.9545599818229675, "learning_rate": 2e-05, "loss": 0.7855, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1732, "tokens_per_second_per_gpu": 18937.51, "total_tokens": 171134790 }, { "epoch": 0.10833958489622406, "grad_norm": 0.9455687403678894, "learning_rate": 2e-05, "loss": 0.753, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1733, "tokens_per_second_per_gpu": 18028.96, "total_tokens": 171232286 }, { "epoch": 0.10840210052513129, "grad_norm": 0.9792025089263916, "learning_rate": 2e-05, "loss": 0.8188, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1734, "tokens_per_second_per_gpu": 17702.12, "total_tokens": 171330833 }, { "epoch": 0.1084646161540385, "grad_norm": 0.9671949148178101, "learning_rate": 2e-05, "loss": 0.7999, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1735, "tokens_per_second_per_gpu": 18040.59, "total_tokens": 171434250 }, { "epoch": 0.10852713178294573, "grad_norm": 0.9287935495376587, "learning_rate": 2e-05, "loss": 0.8562, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1736, "tokens_per_second_per_gpu": 18251.7, "total_tokens": 171536397 }, { "epoch": 0.10858964741185297, "grad_norm": 0.9783524870872498, "learning_rate": 2e-05, "loss": 0.7716, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1737, "tokens_per_second_per_gpu": 18169.5, "total_tokens": 171630969 }, { "epoch": 0.1086521630407602, "grad_norm": 0.9479551911354065, "learning_rate": 2e-05, "loss": 0.7731, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1738, "tokens_per_second_per_gpu": 16799.1, "total_tokens": 171729334 }, { "epoch": 0.10871467866966741, "grad_norm": 0.9849837422370911, "learning_rate": 2e-05, "loss": 0.8412, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1739, "tokens_per_second_per_gpu": 17317.8, "total_tokens": 171830340 }, { "epoch": 0.10877719429857464, "grad_norm": 0.9439547657966614, "learning_rate": 2e-05, "loss": 0.7995, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1740, "tokens_per_second_per_gpu": 18656.6, "total_tokens": 171936290 }, { "epoch": 0.10883970992748188, "grad_norm": 0.9427766799926758, "learning_rate": 2e-05, "loss": 0.7657, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1741, "tokens_per_second_per_gpu": 17492.0, "total_tokens": 172035038 }, { "epoch": 0.10890222555638909, "grad_norm": 0.9505995512008667, "learning_rate": 2e-05, "loss": 0.7804, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1742, "tokens_per_second_per_gpu": 16910.36, "total_tokens": 172134686 }, { "epoch": 0.10896474118529632, "grad_norm": 0.9710884690284729, "learning_rate": 2e-05, "loss": 0.7998, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1743, "tokens_per_second_per_gpu": 17722.63, "total_tokens": 172233849 }, { "epoch": 0.10902725681420355, "grad_norm": 0.9391371607780457, "learning_rate": 2e-05, "loss": 0.7984, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1744, "tokens_per_second_per_gpu": 16886.33, "total_tokens": 172330909 }, { "epoch": 0.10908977244311077, "grad_norm": 0.9887187480926514, "learning_rate": 2e-05, "loss": 0.8163, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1745, "tokens_per_second_per_gpu": 17968.71, "total_tokens": 172428373 }, { "epoch": 0.109152288072018, "grad_norm": 0.9787053465843201, "learning_rate": 2e-05, "loss": 0.7749, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1746, "tokens_per_second_per_gpu": 16794.48, "total_tokens": 172524584 }, { "epoch": 0.10921480370092523, "grad_norm": 0.968660831451416, "learning_rate": 2e-05, "loss": 0.8056, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1747, "tokens_per_second_per_gpu": 17838.61, "total_tokens": 172625306 }, { "epoch": 0.10927731932983246, "grad_norm": 0.9511692523956299, "learning_rate": 2e-05, "loss": 0.7951, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1748, "tokens_per_second_per_gpu": 18008.62, "total_tokens": 172727214 }, { "epoch": 0.10933983495873968, "grad_norm": 0.9424843192100525, "learning_rate": 2e-05, "loss": 0.7528, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1749, "tokens_per_second_per_gpu": 16290.31, "total_tokens": 172822987 }, { "epoch": 0.10940235058764691, "grad_norm": 0.9674400687217712, "learning_rate": 2e-05, "loss": 0.8303, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1750, "tokens_per_second_per_gpu": 16798.02, "total_tokens": 172918822 }, { "epoch": 0.10946486621655414, "grad_norm": 0.9617502689361572, "learning_rate": 2e-05, "loss": 0.7671, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1751, "tokens_per_second_per_gpu": 16886.07, "total_tokens": 173015556 }, { "epoch": 0.10952738184546136, "grad_norm": 0.9733071327209473, "learning_rate": 2e-05, "loss": 0.7445, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1752, "tokens_per_second_per_gpu": 16420.76, "total_tokens": 173104435 }, { "epoch": 0.10958989747436859, "grad_norm": 0.9167875051498413, "learning_rate": 2e-05, "loss": 0.7782, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1753, "tokens_per_second_per_gpu": 17528.19, "total_tokens": 173207220 }, { "epoch": 0.10965241310327582, "grad_norm": 1.0209935903549194, "learning_rate": 2e-05, "loss": 0.7561, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1754, "tokens_per_second_per_gpu": 17813.95, "total_tokens": 173301664 }, { "epoch": 0.10971492873218304, "grad_norm": 0.9681236743927002, "learning_rate": 2e-05, "loss": 0.7908, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1755, "tokens_per_second_per_gpu": 17666.14, "total_tokens": 173400158 }, { "epoch": 0.10977744436109027, "grad_norm": 0.9448840618133545, "learning_rate": 2e-05, "loss": 0.7815, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1756, "tokens_per_second_per_gpu": 17592.11, "total_tokens": 173501751 }, { "epoch": 0.1098399599899975, "grad_norm": 0.9867289066314697, "learning_rate": 2e-05, "loss": 0.7668, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1757, "tokens_per_second_per_gpu": 17607.3, "total_tokens": 173598168 }, { "epoch": 0.10990247561890473, "grad_norm": 0.9390624165534973, "learning_rate": 2e-05, "loss": 0.8079, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1758, "tokens_per_second_per_gpu": 18156.54, "total_tokens": 173698874 }, { "epoch": 0.10996499124781195, "grad_norm": 0.9269596934318542, "learning_rate": 2e-05, "loss": 0.7787, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1759, "tokens_per_second_per_gpu": 17541.83, "total_tokens": 173797777 }, { "epoch": 0.11002750687671918, "grad_norm": 0.9707646369934082, "learning_rate": 2e-05, "loss": 0.8265, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1760, "tokens_per_second_per_gpu": 16783.38, "total_tokens": 173895974 }, { "epoch": 0.11009002250562641, "grad_norm": 0.9356089234352112, "learning_rate": 2e-05, "loss": 0.807, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1761, "tokens_per_second_per_gpu": 18087.19, "total_tokens": 173999560 }, { "epoch": 0.11015253813453363, "grad_norm": 0.9606069326400757, "learning_rate": 2e-05, "loss": 0.7747, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1762, "tokens_per_second_per_gpu": 17464.91, "total_tokens": 174100763 }, { "epoch": 0.11021505376344086, "grad_norm": 0.9557055830955505, "learning_rate": 2e-05, "loss": 0.7936, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1763, "tokens_per_second_per_gpu": 17105.32, "total_tokens": 174202078 }, { "epoch": 0.11027756939234809, "grad_norm": 0.9344338774681091, "learning_rate": 2e-05, "loss": 0.8058, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1764, "tokens_per_second_per_gpu": 16547.01, "total_tokens": 174300525 }, { "epoch": 0.11034008502125531, "grad_norm": 0.9524967670440674, "learning_rate": 2e-05, "loss": 0.7851, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1765, "tokens_per_second_per_gpu": 16318.82, "total_tokens": 174396027 }, { "epoch": 0.11040260065016254, "grad_norm": 0.954206645488739, "learning_rate": 2e-05, "loss": 0.8057, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1766, "tokens_per_second_per_gpu": 17555.31, "total_tokens": 174494979 }, { "epoch": 0.11046511627906977, "grad_norm": 0.9035621881484985, "learning_rate": 2e-05, "loss": 0.7658, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1767, "tokens_per_second_per_gpu": 18133.16, "total_tokens": 174596617 }, { "epoch": 0.110527631907977, "grad_norm": 0.9757164120674133, "learning_rate": 2e-05, "loss": 0.75, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1768, "tokens_per_second_per_gpu": 16724.55, "total_tokens": 174691440 }, { "epoch": 0.11059014753688422, "grad_norm": 0.9688217043876648, "learning_rate": 2e-05, "loss": 0.7607, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1769, "tokens_per_second_per_gpu": 17114.54, "total_tokens": 174789822 }, { "epoch": 0.11065266316579145, "grad_norm": 0.9173604846000671, "learning_rate": 2e-05, "loss": 0.7715, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1770, "tokens_per_second_per_gpu": 18299.5, "total_tokens": 174890314 }, { "epoch": 0.11071517879469868, "grad_norm": 0.9881820678710938, "learning_rate": 2e-05, "loss": 0.795, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1771, "tokens_per_second_per_gpu": 16908.33, "total_tokens": 174986622 }, { "epoch": 0.1107776944236059, "grad_norm": 0.937915563583374, "learning_rate": 2e-05, "loss": 0.7625, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1772, "tokens_per_second_per_gpu": 18212.17, "total_tokens": 175089394 }, { "epoch": 0.11084021005251313, "grad_norm": 0.996738851070404, "learning_rate": 2e-05, "loss": 0.8452, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1773, "tokens_per_second_per_gpu": 18129.21, "total_tokens": 175188916 }, { "epoch": 0.11090272568142036, "grad_norm": 0.9476285576820374, "learning_rate": 2e-05, "loss": 0.7784, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1774, "tokens_per_second_per_gpu": 18269.13, "total_tokens": 175290398 }, { "epoch": 0.11096524131032758, "grad_norm": 1.0316481590270996, "learning_rate": 2e-05, "loss": 0.7907, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1775, "tokens_per_second_per_gpu": 17710.86, "total_tokens": 175390133 }, { "epoch": 0.1110277569392348, "grad_norm": 0.9384517669677734, "learning_rate": 2e-05, "loss": 0.8087, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1776, "tokens_per_second_per_gpu": 17558.64, "total_tokens": 175492931 }, { "epoch": 0.11109027256814204, "grad_norm": 0.9844269156455994, "learning_rate": 2e-05, "loss": 0.8024, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1777, "tokens_per_second_per_gpu": 17239.33, "total_tokens": 175589629 }, { "epoch": 0.11115278819704927, "grad_norm": 0.9392343759536743, "learning_rate": 2e-05, "loss": 0.7938, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1778, "tokens_per_second_per_gpu": 16449.61, "total_tokens": 175684559 }, { "epoch": 0.11121530382595649, "grad_norm": 0.9387767314910889, "learning_rate": 2e-05, "loss": 0.7718, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1779, "tokens_per_second_per_gpu": 17422.85, "total_tokens": 175784603 }, { "epoch": 0.11127781945486372, "grad_norm": 0.9317959547042847, "learning_rate": 2e-05, "loss": 0.7611, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1780, "tokens_per_second_per_gpu": 17488.78, "total_tokens": 175881534 }, { "epoch": 0.11134033508377095, "grad_norm": 0.93868488073349, "learning_rate": 2e-05, "loss": 0.7573, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1781, "tokens_per_second_per_gpu": 16606.14, "total_tokens": 175975952 }, { "epoch": 0.11140285071267816, "grad_norm": 0.9069148898124695, "learning_rate": 2e-05, "loss": 0.7935, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1782, "tokens_per_second_per_gpu": 18141.31, "total_tokens": 176080506 }, { "epoch": 0.1114653663415854, "grad_norm": 0.9177560210227966, "learning_rate": 2e-05, "loss": 0.7761, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1783, "tokens_per_second_per_gpu": 18330.99, "total_tokens": 176183206 }, { "epoch": 0.11152788197049263, "grad_norm": 0.9827948808670044, "learning_rate": 2e-05, "loss": 0.8347, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1784, "tokens_per_second_per_gpu": 18782.54, "total_tokens": 176287140 }, { "epoch": 0.11159039759939984, "grad_norm": 0.9806962609291077, "learning_rate": 2e-05, "loss": 0.8443, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1785, "tokens_per_second_per_gpu": 18244.63, "total_tokens": 176391863 }, { "epoch": 0.11165291322830707, "grad_norm": 0.9605928659439087, "learning_rate": 2e-05, "loss": 0.7646, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1786, "tokens_per_second_per_gpu": 16605.87, "total_tokens": 176490399 }, { "epoch": 0.1117154288572143, "grad_norm": 0.954898476600647, "learning_rate": 2e-05, "loss": 0.7861, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1787, "tokens_per_second_per_gpu": 16345.9, "total_tokens": 176586537 }, { "epoch": 0.11177794448612154, "grad_norm": 0.9366920590400696, "learning_rate": 2e-05, "loss": 0.7834, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1788, "tokens_per_second_per_gpu": 17338.91, "total_tokens": 176685432 }, { "epoch": 0.11184046011502875, "grad_norm": 0.8826562762260437, "learning_rate": 2e-05, "loss": 0.7788, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1789, "tokens_per_second_per_gpu": 18489.08, "total_tokens": 176791815 }, { "epoch": 0.11190297574393598, "grad_norm": 0.98198401927948, "learning_rate": 2e-05, "loss": 0.8117, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1790, "tokens_per_second_per_gpu": 17144.37, "total_tokens": 176891312 }, { "epoch": 0.11196549137284321, "grad_norm": 0.8969886302947998, "learning_rate": 2e-05, "loss": 0.7902, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1791, "tokens_per_second_per_gpu": 18216.83, "total_tokens": 176995261 }, { "epoch": 0.11202800700175043, "grad_norm": 0.9378143548965454, "learning_rate": 2e-05, "loss": 0.7751, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1792, "tokens_per_second_per_gpu": 17862.51, "total_tokens": 177096292 }, { "epoch": 0.11209052263065766, "grad_norm": 0.9608869552612305, "learning_rate": 2e-05, "loss": 0.7686, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1793, "tokens_per_second_per_gpu": 17026.82, "total_tokens": 177193847 }, { "epoch": 0.1121530382595649, "grad_norm": 0.9778483510017395, "learning_rate": 2e-05, "loss": 0.7629, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1794, "tokens_per_second_per_gpu": 17424.32, "total_tokens": 177293985 }, { "epoch": 0.11221555388847212, "grad_norm": 0.9453094005584717, "learning_rate": 2e-05, "loss": 0.8181, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1795, "tokens_per_second_per_gpu": 18004.04, "total_tokens": 177399260 }, { "epoch": 0.11227806951737934, "grad_norm": 0.9451631307601929, "learning_rate": 2e-05, "loss": 0.7832, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1796, "tokens_per_second_per_gpu": 17377.44, "total_tokens": 177501182 }, { "epoch": 0.11234058514628657, "grad_norm": 0.962753176689148, "learning_rate": 2e-05, "loss": 0.7954, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1797, "tokens_per_second_per_gpu": 16387.96, "total_tokens": 177595035 }, { "epoch": 0.1124031007751938, "grad_norm": 0.9136858582496643, "learning_rate": 2e-05, "loss": 0.7736, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1798, "tokens_per_second_per_gpu": 17595.57, "total_tokens": 177692759 }, { "epoch": 0.11246561640410102, "grad_norm": 0.9153572916984558, "learning_rate": 2e-05, "loss": 0.7185, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1799, "tokens_per_second_per_gpu": 17844.87, "total_tokens": 177787384 }, { "epoch": 0.11252813203300825, "grad_norm": 0.9530633687973022, "learning_rate": 2e-05, "loss": 0.8017, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1800, "tokens_per_second_per_gpu": 16982.4, "total_tokens": 177888243 }, { "epoch": 0.11259064766191548, "grad_norm": 0.9739862084388733, "learning_rate": 2e-05, "loss": 0.8118, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1801, "tokens_per_second_per_gpu": 16190.69, "total_tokens": 177979259 }, { "epoch": 0.1126531632908227, "grad_norm": 0.9887681007385254, "learning_rate": 2e-05, "loss": 0.776, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1802, "tokens_per_second_per_gpu": 16606.48, "total_tokens": 178075571 }, { "epoch": 0.11271567891972993, "grad_norm": 0.9366166591644287, "learning_rate": 2e-05, "loss": 0.7455, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1803, "tokens_per_second_per_gpu": 16856.45, "total_tokens": 178169426 }, { "epoch": 0.11277819454863716, "grad_norm": 0.9538989067077637, "learning_rate": 2e-05, "loss": 0.779, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1804, "tokens_per_second_per_gpu": 17254.92, "total_tokens": 178263559 }, { "epoch": 0.11284071017754439, "grad_norm": 0.9659048914909363, "learning_rate": 2e-05, "loss": 0.8108, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1805, "tokens_per_second_per_gpu": 17922.12, "total_tokens": 178363688 }, { "epoch": 0.11290322580645161, "grad_norm": 0.9992122054100037, "learning_rate": 2e-05, "loss": 0.7822, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1806, "tokens_per_second_per_gpu": 15995.32, "total_tokens": 178459962 }, { "epoch": 0.11296574143535884, "grad_norm": 0.9455481171607971, "learning_rate": 2e-05, "loss": 0.8006, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1807, "tokens_per_second_per_gpu": 17454.36, "total_tokens": 178561280 }, { "epoch": 0.11302825706426607, "grad_norm": 0.9274150133132935, "learning_rate": 2e-05, "loss": 0.8045, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1808, "tokens_per_second_per_gpu": 16720.02, "total_tokens": 178659905 }, { "epoch": 0.11309077269317329, "grad_norm": 0.9595226049423218, "learning_rate": 2e-05, "loss": 0.7966, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1809, "tokens_per_second_per_gpu": 18645.05, "total_tokens": 178761727 }, { "epoch": 0.11315328832208052, "grad_norm": 0.9648453593254089, "learning_rate": 2e-05, "loss": 0.8282, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1810, "tokens_per_second_per_gpu": 17975.54, "total_tokens": 178863089 }, { "epoch": 0.11321580395098775, "grad_norm": 0.9377473592758179, "learning_rate": 2e-05, "loss": 0.7599, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1811, "tokens_per_second_per_gpu": 16462.38, "total_tokens": 178959877 }, { "epoch": 0.11327831957989497, "grad_norm": 0.9306060075759888, "learning_rate": 2e-05, "loss": 0.7921, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1812, "tokens_per_second_per_gpu": 17246.34, "total_tokens": 179057679 }, { "epoch": 0.1133408352088022, "grad_norm": 1.0110992193222046, "learning_rate": 2e-05, "loss": 0.8279, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1813, "tokens_per_second_per_gpu": 18099.73, "total_tokens": 179156845 }, { "epoch": 0.11340335083770943, "grad_norm": 0.9778465628623962, "learning_rate": 2e-05, "loss": 0.7994, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1814, "tokens_per_second_per_gpu": 18332.43, "total_tokens": 179258814 }, { "epoch": 0.11346586646661666, "grad_norm": 0.9806351661682129, "learning_rate": 2e-05, "loss": 0.8054, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1815, "tokens_per_second_per_gpu": 18258.94, "total_tokens": 179362562 }, { "epoch": 0.11352838209552388, "grad_norm": 0.9905957579612732, "learning_rate": 2e-05, "loss": 0.7921, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1816, "tokens_per_second_per_gpu": 17518.02, "total_tokens": 179459078 }, { "epoch": 0.11359089772443111, "grad_norm": 0.9670182466506958, "learning_rate": 2e-05, "loss": 0.7654, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1817, "tokens_per_second_per_gpu": 17385.72, "total_tokens": 179558791 }, { "epoch": 0.11365341335333834, "grad_norm": 0.9341626763343811, "learning_rate": 2e-05, "loss": 0.8273, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1818, "tokens_per_second_per_gpu": 18388.12, "total_tokens": 179660752 }, { "epoch": 0.11371592898224556, "grad_norm": 0.9550228714942932, "learning_rate": 2e-05, "loss": 0.7936, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1819, "tokens_per_second_per_gpu": 17345.69, "total_tokens": 179757781 }, { "epoch": 0.11377844461115279, "grad_norm": 0.9637555480003357, "learning_rate": 2e-05, "loss": 0.7929, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1820, "tokens_per_second_per_gpu": 15963.5, "total_tokens": 179851819 }, { "epoch": 0.11384096024006002, "grad_norm": 0.9519586563110352, "learning_rate": 2e-05, "loss": 0.8294, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1821, "tokens_per_second_per_gpu": 18238.46, "total_tokens": 179954231 }, { "epoch": 0.11390347586896724, "grad_norm": 0.9473772048950195, "learning_rate": 2e-05, "loss": 0.8001, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1822, "tokens_per_second_per_gpu": 17459.71, "total_tokens": 180058472 }, { "epoch": 0.11396599149787447, "grad_norm": 0.9326053857803345, "learning_rate": 2e-05, "loss": 0.785, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1823, "tokens_per_second_per_gpu": 18033.32, "total_tokens": 180160381 }, { "epoch": 0.1140285071267817, "grad_norm": 0.9416096210479736, "learning_rate": 2e-05, "loss": 0.7815, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1824, "tokens_per_second_per_gpu": 18163.99, "total_tokens": 180260023 }, { "epoch": 0.11409102275568893, "grad_norm": 0.9491218328475952, "learning_rate": 2e-05, "loss": 0.7575, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1825, "tokens_per_second_per_gpu": 16944.11, "total_tokens": 180355055 }, { "epoch": 0.11415353838459615, "grad_norm": 1.028613805770874, "learning_rate": 2e-05, "loss": 0.8165, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1826, "tokens_per_second_per_gpu": 18105.72, "total_tokens": 180454807 }, { "epoch": 0.11421605401350338, "grad_norm": 0.9899656772613525, "learning_rate": 2e-05, "loss": 0.7962, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1827, "tokens_per_second_per_gpu": 16421.17, "total_tokens": 180551280 }, { "epoch": 0.11427856964241061, "grad_norm": 0.9573479294776917, "learning_rate": 2e-05, "loss": 0.7994, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1828, "tokens_per_second_per_gpu": 17497.07, "total_tokens": 180651471 }, { "epoch": 0.11434108527131782, "grad_norm": 0.9401739239692688, "learning_rate": 2e-05, "loss": 0.7911, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1829, "tokens_per_second_per_gpu": 17796.88, "total_tokens": 180750676 }, { "epoch": 0.11440360090022506, "grad_norm": 0.9710729122161865, "learning_rate": 2e-05, "loss": 0.8132, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1830, "tokens_per_second_per_gpu": 17866.04, "total_tokens": 180852652 }, { "epoch": 0.11446611652913229, "grad_norm": 1.0070366859436035, "learning_rate": 2e-05, "loss": 0.765, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1831, "tokens_per_second_per_gpu": 18002.22, "total_tokens": 180950877 }, { "epoch": 0.1145286321580395, "grad_norm": 0.9682475924491882, "learning_rate": 2e-05, "loss": 0.7724, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1832, "tokens_per_second_per_gpu": 17365.47, "total_tokens": 181048663 }, { "epoch": 0.11459114778694673, "grad_norm": 0.9283250570297241, "learning_rate": 2e-05, "loss": 0.7522, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1833, "tokens_per_second_per_gpu": 17930.05, "total_tokens": 181147932 }, { "epoch": 0.11465366341585397, "grad_norm": 1.0039527416229248, "learning_rate": 2e-05, "loss": 0.7896, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1834, "tokens_per_second_per_gpu": 16495.38, "total_tokens": 181245068 }, { "epoch": 0.1147161790447612, "grad_norm": 0.9904265999794006, "learning_rate": 2e-05, "loss": 0.7925, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1835, "tokens_per_second_per_gpu": 17692.39, "total_tokens": 181345086 }, { "epoch": 0.11477869467366841, "grad_norm": 0.9642106890678406, "learning_rate": 2e-05, "loss": 0.7785, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1836, "tokens_per_second_per_gpu": 16813.49, "total_tokens": 181444054 }, { "epoch": 0.11484121030257564, "grad_norm": 0.9542462825775146, "learning_rate": 2e-05, "loss": 0.761, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1837, "tokens_per_second_per_gpu": 17260.25, "total_tokens": 181543654 }, { "epoch": 0.11490372593148288, "grad_norm": 1.0362435579299927, "learning_rate": 2e-05, "loss": 0.8023, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1838, "tokens_per_second_per_gpu": 17094.96, "total_tokens": 181640728 }, { "epoch": 0.11496624156039009, "grad_norm": 0.9196813106536865, "learning_rate": 2e-05, "loss": 0.726, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1839, "tokens_per_second_per_gpu": 17135.7, "total_tokens": 181738167 }, { "epoch": 0.11502875718929732, "grad_norm": 0.9262503385543823, "learning_rate": 2e-05, "loss": 0.838, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1840, "tokens_per_second_per_gpu": 17196.54, "total_tokens": 181839543 }, { "epoch": 0.11509127281820455, "grad_norm": 0.9746553897857666, "learning_rate": 2e-05, "loss": 0.7646, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1841, "tokens_per_second_per_gpu": 17704.34, "total_tokens": 181936547 }, { "epoch": 0.11515378844711177, "grad_norm": 0.9530996084213257, "learning_rate": 2e-05, "loss": 0.7861, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1842, "tokens_per_second_per_gpu": 16457.87, "total_tokens": 182033751 }, { "epoch": 0.115216304076019, "grad_norm": 0.926037609577179, "learning_rate": 2e-05, "loss": 0.7847, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1843, "tokens_per_second_per_gpu": 17348.22, "total_tokens": 182132004 }, { "epoch": 0.11527881970492623, "grad_norm": 0.98553466796875, "learning_rate": 2e-05, "loss": 0.7698, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1844, "tokens_per_second_per_gpu": 17506.58, "total_tokens": 182226506 }, { "epoch": 0.11534133533383346, "grad_norm": 0.9654994010925293, "learning_rate": 2e-05, "loss": 0.7492, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1845, "tokens_per_second_per_gpu": 17809.14, "total_tokens": 182323006 }, { "epoch": 0.11540385096274068, "grad_norm": 0.9431560039520264, "learning_rate": 2e-05, "loss": 0.7303, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1846, "tokens_per_second_per_gpu": 16292.59, "total_tokens": 182419155 }, { "epoch": 0.11546636659164791, "grad_norm": 0.9357687830924988, "learning_rate": 2e-05, "loss": 0.7319, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1847, "tokens_per_second_per_gpu": 16506.98, "total_tokens": 182516922 }, { "epoch": 0.11552888222055514, "grad_norm": 0.966159462928772, "learning_rate": 2e-05, "loss": 0.7741, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1848, "tokens_per_second_per_gpu": 17468.29, "total_tokens": 182616133 }, { "epoch": 0.11559139784946236, "grad_norm": 0.975761353969574, "learning_rate": 2e-05, "loss": 0.7483, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1849, "tokens_per_second_per_gpu": 17767.67, "total_tokens": 182711104 }, { "epoch": 0.11565391347836959, "grad_norm": 0.9650139212608337, "learning_rate": 2e-05, "loss": 0.7936, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1850, "tokens_per_second_per_gpu": 17218.82, "total_tokens": 182809235 }, { "epoch": 0.11571642910727682, "grad_norm": 0.935028076171875, "learning_rate": 2e-05, "loss": 0.7897, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1851, "tokens_per_second_per_gpu": 16777.81, "total_tokens": 182909025 }, { "epoch": 0.11577894473618405, "grad_norm": 1.1172081232070923, "learning_rate": 2e-05, "loss": 0.7964, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1852, "tokens_per_second_per_gpu": 17411.74, "total_tokens": 183009591 }, { "epoch": 0.11584146036509127, "grad_norm": 0.9644613862037659, "learning_rate": 2e-05, "loss": 0.759, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1853, "tokens_per_second_per_gpu": 17137.46, "total_tokens": 183107036 }, { "epoch": 0.1159039759939985, "grad_norm": 0.9560644626617432, "learning_rate": 2e-05, "loss": 0.7967, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1854, "tokens_per_second_per_gpu": 16053.74, "total_tokens": 183204348 }, { "epoch": 0.11596649162290573, "grad_norm": 0.9623815417289734, "learning_rate": 2e-05, "loss": 0.789, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1855, "tokens_per_second_per_gpu": 16866.8, "total_tokens": 183303900 }, { "epoch": 0.11602900725181295, "grad_norm": 0.9730267524719238, "learning_rate": 2e-05, "loss": 0.77, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1856, "tokens_per_second_per_gpu": 16835.01, "total_tokens": 183396517 }, { "epoch": 0.11609152288072018, "grad_norm": 0.9976636171340942, "learning_rate": 2e-05, "loss": 0.8288, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1857, "tokens_per_second_per_gpu": 18454.51, "total_tokens": 183499113 }, { "epoch": 0.11615403850962741, "grad_norm": 0.896805465221405, "learning_rate": 2e-05, "loss": 0.7944, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1858, "tokens_per_second_per_gpu": 18479.67, "total_tokens": 183600452 }, { "epoch": 0.11621655413853463, "grad_norm": 0.9698087573051453, "learning_rate": 2e-05, "loss": 0.7589, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1859, "tokens_per_second_per_gpu": 16926.9, "total_tokens": 183698592 }, { "epoch": 0.11627906976744186, "grad_norm": 0.9498625993728638, "learning_rate": 2e-05, "loss": 0.7487, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1860, "tokens_per_second_per_gpu": 17984.1, "total_tokens": 183796560 }, { "epoch": 0.11634158539634909, "grad_norm": 0.992955207824707, "learning_rate": 2e-05, "loss": 0.7931, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1861, "tokens_per_second_per_gpu": 16698.42, "total_tokens": 183892396 }, { "epoch": 0.11640410102525632, "grad_norm": 0.9083285331726074, "learning_rate": 2e-05, "loss": 0.7707, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1862, "tokens_per_second_per_gpu": 17072.38, "total_tokens": 183994866 }, { "epoch": 0.11646661665416354, "grad_norm": 0.9198442101478577, "learning_rate": 2e-05, "loss": 0.7597, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1863, "tokens_per_second_per_gpu": 17192.2, "total_tokens": 184091669 }, { "epoch": 0.11652913228307077, "grad_norm": 0.9130765199661255, "learning_rate": 2e-05, "loss": 0.792, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1864, "tokens_per_second_per_gpu": 17523.83, "total_tokens": 184189537 }, { "epoch": 0.116591647911978, "grad_norm": 0.9444694519042969, "learning_rate": 2e-05, "loss": 0.7946, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1865, "tokens_per_second_per_gpu": 17261.38, "total_tokens": 184289810 }, { "epoch": 0.11665416354088522, "grad_norm": 0.9167326092720032, "learning_rate": 2e-05, "loss": 0.7778, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1866, "tokens_per_second_per_gpu": 17207.39, "total_tokens": 184389206 }, { "epoch": 0.11671667916979245, "grad_norm": 0.9782905578613281, "learning_rate": 2e-05, "loss": 0.8387, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1867, "tokens_per_second_per_gpu": 17680.5, "total_tokens": 184490777 }, { "epoch": 0.11677919479869968, "grad_norm": 0.9566085934638977, "learning_rate": 2e-05, "loss": 0.7601, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1868, "tokens_per_second_per_gpu": 15879.85, "total_tokens": 184585526 }, { "epoch": 0.1168417104276069, "grad_norm": 0.9516909122467041, "learning_rate": 2e-05, "loss": 0.8442, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1869, "tokens_per_second_per_gpu": 16887.35, "total_tokens": 184682456 }, { "epoch": 0.11690422605651413, "grad_norm": 0.9017894268035889, "learning_rate": 2e-05, "loss": 0.7645, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1870, "tokens_per_second_per_gpu": 18140.93, "total_tokens": 184782409 }, { "epoch": 0.11696674168542136, "grad_norm": 0.9617956280708313, "learning_rate": 2e-05, "loss": 0.8217, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1871, "tokens_per_second_per_gpu": 17426.88, "total_tokens": 184877627 }, { "epoch": 0.11702925731432859, "grad_norm": 0.962515115737915, "learning_rate": 2e-05, "loss": 0.814, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1872, "tokens_per_second_per_gpu": 17089.52, "total_tokens": 184975803 }, { "epoch": 0.1170917729432358, "grad_norm": 0.9190242886543274, "learning_rate": 2e-05, "loss": 0.745, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1873, "tokens_per_second_per_gpu": 17604.63, "total_tokens": 185070959 }, { "epoch": 0.11715428857214304, "grad_norm": 0.9398333430290222, "learning_rate": 2e-05, "loss": 0.7815, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1874, "tokens_per_second_per_gpu": 17162.91, "total_tokens": 185169582 }, { "epoch": 0.11721680420105027, "grad_norm": 0.9279540777206421, "learning_rate": 2e-05, "loss": 0.784, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1875, "tokens_per_second_per_gpu": 18231.98, "total_tokens": 185268636 }, { "epoch": 0.11727931982995748, "grad_norm": 0.8972029685974121, "learning_rate": 2e-05, "loss": 0.7857, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1876, "tokens_per_second_per_gpu": 18284.14, "total_tokens": 185373010 }, { "epoch": 0.11734183545886472, "grad_norm": 0.9687308669090271, "learning_rate": 2e-05, "loss": 0.7127, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1877, "tokens_per_second_per_gpu": 17926.76, "total_tokens": 185467867 }, { "epoch": 0.11740435108777195, "grad_norm": 1.017829418182373, "learning_rate": 2e-05, "loss": 0.7933, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1878, "tokens_per_second_per_gpu": 17137.61, "total_tokens": 185567555 }, { "epoch": 0.11746686671667916, "grad_norm": 0.9262449741363525, "learning_rate": 2e-05, "loss": 0.7649, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1879, "tokens_per_second_per_gpu": 17214.93, "total_tokens": 185666176 }, { "epoch": 0.1175293823455864, "grad_norm": 0.9280195832252502, "learning_rate": 2e-05, "loss": 0.8011, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1880, "tokens_per_second_per_gpu": 18173.69, "total_tokens": 185769714 }, { "epoch": 0.11759189797449363, "grad_norm": 0.9539895057678223, "learning_rate": 2e-05, "loss": 0.7729, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1881, "tokens_per_second_per_gpu": 16815.33, "total_tokens": 185862613 }, { "epoch": 0.11765441360340086, "grad_norm": 0.9194240570068359, "learning_rate": 2e-05, "loss": 0.7834, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1882, "tokens_per_second_per_gpu": 18641.31, "total_tokens": 185965986 }, { "epoch": 0.11771692923230807, "grad_norm": 0.9408594369888306, "learning_rate": 2e-05, "loss": 0.7951, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1883, "tokens_per_second_per_gpu": 18311.79, "total_tokens": 186067093 }, { "epoch": 0.1177794448612153, "grad_norm": 0.9565678834915161, "learning_rate": 2e-05, "loss": 0.7797, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1884, "tokens_per_second_per_gpu": 15690.84, "total_tokens": 186162240 }, { "epoch": 0.11784196049012254, "grad_norm": 0.9560222029685974, "learning_rate": 2e-05, "loss": 0.7838, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1885, "tokens_per_second_per_gpu": 17430.82, "total_tokens": 186260059 }, { "epoch": 0.11790447611902975, "grad_norm": 0.9333645105361938, "learning_rate": 2e-05, "loss": 0.7709, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1886, "tokens_per_second_per_gpu": 18008.25, "total_tokens": 186360395 }, { "epoch": 0.11796699174793698, "grad_norm": 1.0138545036315918, "learning_rate": 2e-05, "loss": 0.7807, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1887, "tokens_per_second_per_gpu": 18153.66, "total_tokens": 186457890 }, { "epoch": 0.11802950737684421, "grad_norm": 0.9449602961540222, "learning_rate": 2e-05, "loss": 0.7874, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1888, "tokens_per_second_per_gpu": 17731.75, "total_tokens": 186557217 }, { "epoch": 0.11809202300575143, "grad_norm": 0.9276763796806335, "learning_rate": 2e-05, "loss": 0.7794, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1889, "tokens_per_second_per_gpu": 18075.97, "total_tokens": 186658111 }, { "epoch": 0.11815453863465866, "grad_norm": 0.969375729560852, "learning_rate": 2e-05, "loss": 0.7989, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1890, "tokens_per_second_per_gpu": 18018.72, "total_tokens": 186756980 }, { "epoch": 0.1182170542635659, "grad_norm": 0.9404190182685852, "learning_rate": 2e-05, "loss": 0.8133, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1891, "tokens_per_second_per_gpu": 17397.81, "total_tokens": 186855589 }, { "epoch": 0.11827956989247312, "grad_norm": 0.9617750644683838, "learning_rate": 2e-05, "loss": 0.7845, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1892, "tokens_per_second_per_gpu": 18140.19, "total_tokens": 186954611 }, { "epoch": 0.11834208552138034, "grad_norm": 0.9642019867897034, "learning_rate": 2e-05, "loss": 0.8109, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1893, "tokens_per_second_per_gpu": 16866.2, "total_tokens": 187052518 }, { "epoch": 0.11840460115028757, "grad_norm": 1.0076743364334106, "learning_rate": 2e-05, "loss": 0.7394, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1894, "tokens_per_second_per_gpu": 16523.33, "total_tokens": 187140778 }, { "epoch": 0.1184671167791948, "grad_norm": 0.9293689727783203, "learning_rate": 2e-05, "loss": 0.7685, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1895, "tokens_per_second_per_gpu": 17022.38, "total_tokens": 187240489 }, { "epoch": 0.11852963240810202, "grad_norm": 0.9729219675064087, "learning_rate": 2e-05, "loss": 0.8242, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1896, "tokens_per_second_per_gpu": 18140.52, "total_tokens": 187341912 }, { "epoch": 0.11859214803700925, "grad_norm": 0.950587809085846, "learning_rate": 2e-05, "loss": 0.7355, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1897, "tokens_per_second_per_gpu": 16852.63, "total_tokens": 187436723 }, { "epoch": 0.11865466366591648, "grad_norm": 0.9494062066078186, "learning_rate": 2e-05, "loss": 0.7477, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1898, "tokens_per_second_per_gpu": 17362.01, "total_tokens": 187534876 }, { "epoch": 0.1187171792948237, "grad_norm": 0.973320722579956, "learning_rate": 2e-05, "loss": 0.7734, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1899, "tokens_per_second_per_gpu": 16749.85, "total_tokens": 187629661 }, { "epoch": 0.11877969492373093, "grad_norm": 0.9483040571212769, "learning_rate": 2e-05, "loss": 0.7893, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1900, "tokens_per_second_per_gpu": 17084.01, "total_tokens": 187725923 }, { "epoch": 0.11884221055263816, "grad_norm": 0.9362061023712158, "learning_rate": 2e-05, "loss": 0.8089, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1901, "tokens_per_second_per_gpu": 17893.79, "total_tokens": 187823106 }, { "epoch": 0.11890472618154539, "grad_norm": 0.9381709694862366, "learning_rate": 2e-05, "loss": 0.7606, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1902, "tokens_per_second_per_gpu": 17797.42, "total_tokens": 187922877 }, { "epoch": 0.11896724181045261, "grad_norm": 0.9257428050041199, "learning_rate": 2e-05, "loss": 0.766, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1903, "tokens_per_second_per_gpu": 18014.57, "total_tokens": 188024215 }, { "epoch": 0.11902975743935984, "grad_norm": 0.9878075122833252, "learning_rate": 2e-05, "loss": 0.772, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1904, "tokens_per_second_per_gpu": 17899.04, "total_tokens": 188122620 }, { "epoch": 0.11909227306826707, "grad_norm": 0.9908477663993835, "learning_rate": 2e-05, "loss": 0.7868, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1905, "tokens_per_second_per_gpu": 17346.96, "total_tokens": 188221969 }, { "epoch": 0.11915478869717429, "grad_norm": 0.9649039506912231, "learning_rate": 2e-05, "loss": 0.7928, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1906, "tokens_per_second_per_gpu": 16243.9, "total_tokens": 188319681 }, { "epoch": 0.11921730432608152, "grad_norm": 0.9489207863807678, "learning_rate": 2e-05, "loss": 0.7626, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1907, "tokens_per_second_per_gpu": 16575.21, "total_tokens": 188413767 }, { "epoch": 0.11927981995498875, "grad_norm": 0.9761773347854614, "learning_rate": 2e-05, "loss": 0.778, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1908, "tokens_per_second_per_gpu": 17218.96, "total_tokens": 188512334 }, { "epoch": 0.11934233558389597, "grad_norm": 0.9939227104187012, "learning_rate": 2e-05, "loss": 0.8058, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1909, "tokens_per_second_per_gpu": 17441.36, "total_tokens": 188608695 }, { "epoch": 0.1194048512128032, "grad_norm": 0.9301954507827759, "learning_rate": 2e-05, "loss": 0.7607, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1910, "tokens_per_second_per_gpu": 16092.98, "total_tokens": 188704857 }, { "epoch": 0.11946736684171043, "grad_norm": 0.9718918800354004, "learning_rate": 2e-05, "loss": 0.7825, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1911, "tokens_per_second_per_gpu": 15913.6, "total_tokens": 188797939 }, { "epoch": 0.11952988247061766, "grad_norm": 0.9137513637542725, "learning_rate": 2e-05, "loss": 0.7638, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1912, "tokens_per_second_per_gpu": 17419.64, "total_tokens": 188897747 }, { "epoch": 0.11959239809952488, "grad_norm": 0.9609819650650024, "learning_rate": 2e-05, "loss": 0.8359, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1913, "tokens_per_second_per_gpu": 17952.64, "total_tokens": 188997380 }, { "epoch": 0.11965491372843211, "grad_norm": 0.9960382580757141, "learning_rate": 2e-05, "loss": 0.7877, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1914, "tokens_per_second_per_gpu": 17490.36, "total_tokens": 189096459 }, { "epoch": 0.11971742935733934, "grad_norm": 0.9447379112243652, "learning_rate": 2e-05, "loss": 0.7704, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1915, "tokens_per_second_per_gpu": 17422.19, "total_tokens": 189190382 }, { "epoch": 0.11977994498624656, "grad_norm": 0.9876344203948975, "learning_rate": 2e-05, "loss": 0.7654, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1916, "tokens_per_second_per_gpu": 15841.31, "total_tokens": 189283072 }, { "epoch": 0.11984246061515379, "grad_norm": 0.9350468516349792, "learning_rate": 2e-05, "loss": 0.8104, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1917, "tokens_per_second_per_gpu": 17746.16, "total_tokens": 189387461 }, { "epoch": 0.11990497624406102, "grad_norm": 0.9406555891036987, "learning_rate": 2e-05, "loss": 0.755, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1918, "tokens_per_second_per_gpu": 16826.08, "total_tokens": 189486757 }, { "epoch": 0.11996749187296825, "grad_norm": 0.9232341051101685, "learning_rate": 2e-05, "loss": 0.7907, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1919, "tokens_per_second_per_gpu": 18228.26, "total_tokens": 189586590 }, { "epoch": 0.12003000750187547, "grad_norm": 0.95257169008255, "learning_rate": 2e-05, "loss": 0.7574, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1920, "tokens_per_second_per_gpu": 17324.64, "total_tokens": 189685224 }, { "epoch": 0.1200925231307827, "grad_norm": 0.9892132878303528, "learning_rate": 2e-05, "loss": 0.8117, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1921, "tokens_per_second_per_gpu": 16726.64, "total_tokens": 189780234 }, { "epoch": 0.12015503875968993, "grad_norm": 0.9524456262588501, "learning_rate": 2e-05, "loss": 0.7837, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1922, "tokens_per_second_per_gpu": 17175.82, "total_tokens": 189879036 }, { "epoch": 0.12021755438859714, "grad_norm": 0.9616438746452332, "learning_rate": 2e-05, "loss": 0.8354, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1923, "tokens_per_second_per_gpu": 17785.9, "total_tokens": 189980187 }, { "epoch": 0.12028007001750438, "grad_norm": 0.9669979214668274, "learning_rate": 2e-05, "loss": 0.7573, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1924, "tokens_per_second_per_gpu": 17054.95, "total_tokens": 190073777 }, { "epoch": 0.1203425856464116, "grad_norm": 0.9537477493286133, "learning_rate": 2e-05, "loss": 0.7402, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1925, "tokens_per_second_per_gpu": 17125.14, "total_tokens": 190169752 }, { "epoch": 0.12040510127531882, "grad_norm": 0.9268317222595215, "learning_rate": 2e-05, "loss": 0.7628, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1926, "tokens_per_second_per_gpu": 16540.94, "total_tokens": 190268994 }, { "epoch": 0.12046761690422605, "grad_norm": 0.8976914286613464, "learning_rate": 2e-05, "loss": 0.7338, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1927, "tokens_per_second_per_gpu": 17517.91, "total_tokens": 190367107 }, { "epoch": 0.12053013253313329, "grad_norm": 0.948339581489563, "learning_rate": 2e-05, "loss": 0.8079, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1928, "tokens_per_second_per_gpu": 17195.22, "total_tokens": 190468111 }, { "epoch": 0.12059264816204052, "grad_norm": 0.9619932174682617, "learning_rate": 2e-05, "loss": 0.7859, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1929, "tokens_per_second_per_gpu": 16961.72, "total_tokens": 190563946 }, { "epoch": 0.12065516379094773, "grad_norm": 0.9517277479171753, "learning_rate": 2e-05, "loss": 0.7405, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1930, "tokens_per_second_per_gpu": 16445.36, "total_tokens": 190658148 }, { "epoch": 0.12071767941985496, "grad_norm": 0.9587910771369934, "learning_rate": 2e-05, "loss": 0.7131, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1931, "tokens_per_second_per_gpu": 16399.69, "total_tokens": 190751665 }, { "epoch": 0.1207801950487622, "grad_norm": 0.9686313271522522, "learning_rate": 2e-05, "loss": 0.8047, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1932, "tokens_per_second_per_gpu": 16873.15, "total_tokens": 190848565 }, { "epoch": 0.12084271067766941, "grad_norm": 0.9201074838638306, "learning_rate": 2e-05, "loss": 0.7675, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1933, "tokens_per_second_per_gpu": 18178.52, "total_tokens": 190948796 }, { "epoch": 0.12090522630657664, "grad_norm": 0.9438064098358154, "learning_rate": 2e-05, "loss": 0.7727, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1934, "tokens_per_second_per_gpu": 17470.62, "total_tokens": 191048073 }, { "epoch": 0.12096774193548387, "grad_norm": 0.960120439529419, "learning_rate": 2e-05, "loss": 0.7669, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1935, "tokens_per_second_per_gpu": 17886.58, "total_tokens": 191145078 }, { "epoch": 0.12103025756439109, "grad_norm": 0.9331203103065491, "learning_rate": 2e-05, "loss": 0.8039, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1936, "tokens_per_second_per_gpu": 17586.16, "total_tokens": 191243299 }, { "epoch": 0.12109277319329832, "grad_norm": 0.9193524122238159, "learning_rate": 2e-05, "loss": 0.7801, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1937, "tokens_per_second_per_gpu": 17688.63, "total_tokens": 191341307 }, { "epoch": 0.12115528882220555, "grad_norm": 0.9186713695526123, "learning_rate": 2e-05, "loss": 0.742, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1938, "tokens_per_second_per_gpu": 16632.39, "total_tokens": 191437323 }, { "epoch": 0.12121780445111278, "grad_norm": 0.8928077220916748, "learning_rate": 2e-05, "loss": 0.7904, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1939, "tokens_per_second_per_gpu": 17816.13, "total_tokens": 191541193 }, { "epoch": 0.12128032008002, "grad_norm": 0.9590166211128235, "learning_rate": 2e-05, "loss": 0.7891, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1940, "tokens_per_second_per_gpu": 16746.07, "total_tokens": 191640864 }, { "epoch": 0.12134283570892723, "grad_norm": 0.9639939665794373, "learning_rate": 2e-05, "loss": 0.7795, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1941, "tokens_per_second_per_gpu": 16541.13, "total_tokens": 191737062 }, { "epoch": 0.12140535133783446, "grad_norm": 0.9882084131240845, "learning_rate": 2e-05, "loss": 0.7607, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1942, "tokens_per_second_per_gpu": 15956.79, "total_tokens": 191830420 }, { "epoch": 0.12146786696674168, "grad_norm": 0.9455439448356628, "learning_rate": 2e-05, "loss": 0.8083, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1943, "tokens_per_second_per_gpu": 17583.41, "total_tokens": 191932427 }, { "epoch": 0.12153038259564891, "grad_norm": 0.9575437903404236, "learning_rate": 2e-05, "loss": 0.7606, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1944, "tokens_per_second_per_gpu": 17505.45, "total_tokens": 192030913 }, { "epoch": 0.12159289822455614, "grad_norm": 1.031419277191162, "learning_rate": 2e-05, "loss": 0.8004, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1945, "tokens_per_second_per_gpu": 16942.47, "total_tokens": 192123938 }, { "epoch": 0.12165541385346336, "grad_norm": 0.9784757494926453, "learning_rate": 2e-05, "loss": 0.7522, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1946, "tokens_per_second_per_gpu": 17009.13, "total_tokens": 192221922 }, { "epoch": 0.12171792948237059, "grad_norm": 0.919151246547699, "learning_rate": 2e-05, "loss": 0.7762, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1947, "tokens_per_second_per_gpu": 17290.52, "total_tokens": 192322737 }, { "epoch": 0.12178044511127782, "grad_norm": 0.9473093748092651, "learning_rate": 2e-05, "loss": 0.7711, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1948, "tokens_per_second_per_gpu": 17257.81, "total_tokens": 192419238 }, { "epoch": 0.12184296074018505, "grad_norm": 0.9453833699226379, "learning_rate": 2e-05, "loss": 0.7644, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1949, "tokens_per_second_per_gpu": 18307.12, "total_tokens": 192515970 }, { "epoch": 0.12190547636909227, "grad_norm": 0.9390974044799805, "learning_rate": 2e-05, "loss": 0.7611, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1950, "tokens_per_second_per_gpu": 16771.46, "total_tokens": 192612987 }, { "epoch": 0.1219679919979995, "grad_norm": 0.9540968537330627, "learning_rate": 2e-05, "loss": 0.8056, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1951, "tokens_per_second_per_gpu": 17813.73, "total_tokens": 192714944 }, { "epoch": 0.12203050762690673, "grad_norm": 0.9572805166244507, "learning_rate": 2e-05, "loss": 0.7634, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1952, "tokens_per_second_per_gpu": 17414.09, "total_tokens": 192812612 }, { "epoch": 0.12209302325581395, "grad_norm": 0.9322742223739624, "learning_rate": 2e-05, "loss": 0.782, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1953, "tokens_per_second_per_gpu": 18405.38, "total_tokens": 192916722 }, { "epoch": 0.12215553888472118, "grad_norm": 0.9586840271949768, "learning_rate": 2e-05, "loss": 0.7801, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1954, "tokens_per_second_per_gpu": 16709.64, "total_tokens": 193009401 }, { "epoch": 0.12221805451362841, "grad_norm": 0.9973980188369751, "learning_rate": 2e-05, "loss": 0.7854, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1955, "tokens_per_second_per_gpu": 17231.57, "total_tokens": 193109290 }, { "epoch": 0.12228057014253563, "grad_norm": 0.9313364028930664, "learning_rate": 2e-05, "loss": 0.7516, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1956, "tokens_per_second_per_gpu": 15458.65, "total_tokens": 193203269 }, { "epoch": 0.12234308577144286, "grad_norm": 0.9569217562675476, "learning_rate": 2e-05, "loss": 0.7527, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1957, "tokens_per_second_per_gpu": 15349.54, "total_tokens": 193297220 }, { "epoch": 0.12240560140035009, "grad_norm": 0.953876256942749, "learning_rate": 2e-05, "loss": 0.7886, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1958, "tokens_per_second_per_gpu": 17640.07, "total_tokens": 193396594 }, { "epoch": 0.12246811702925732, "grad_norm": 0.9303469657897949, "learning_rate": 2e-05, "loss": 0.7852, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1959, "tokens_per_second_per_gpu": 17741.66, "total_tokens": 193494957 }, { "epoch": 0.12253063265816454, "grad_norm": 0.9632790684700012, "learning_rate": 2e-05, "loss": 0.7867, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1960, "tokens_per_second_per_gpu": 16281.01, "total_tokens": 193588916 }, { "epoch": 0.12259314828707177, "grad_norm": 0.9097832441329956, "learning_rate": 2e-05, "loss": 0.7706, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1961, "tokens_per_second_per_gpu": 16727.14, "total_tokens": 193687402 }, { "epoch": 0.122655663915979, "grad_norm": 0.9042445421218872, "learning_rate": 2e-05, "loss": 0.7971, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1962, "tokens_per_second_per_gpu": 18428.85, "total_tokens": 193790370 }, { "epoch": 0.12271817954488622, "grad_norm": 0.9282596707344055, "learning_rate": 2e-05, "loss": 0.7576, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1963, "tokens_per_second_per_gpu": 17063.27, "total_tokens": 193889213 }, { "epoch": 0.12278069517379345, "grad_norm": 0.9372159242630005, "learning_rate": 2e-05, "loss": 0.7481, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1964, "tokens_per_second_per_gpu": 18186.4, "total_tokens": 193988103 }, { "epoch": 0.12284321080270068, "grad_norm": 0.9824783802032471, "learning_rate": 2e-05, "loss": 0.7299, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1965, "tokens_per_second_per_gpu": 17534.18, "total_tokens": 194087705 }, { "epoch": 0.1229057264316079, "grad_norm": 0.9537524580955505, "learning_rate": 2e-05, "loss": 0.7961, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1966, "tokens_per_second_per_gpu": 17122.66, "total_tokens": 194186731 }, { "epoch": 0.12296824206051513, "grad_norm": 0.9384617805480957, "learning_rate": 2e-05, "loss": 0.7608, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1967, "tokens_per_second_per_gpu": 16402.16, "total_tokens": 194283202 }, { "epoch": 0.12303075768942236, "grad_norm": 0.9710803031921387, "learning_rate": 2e-05, "loss": 0.7586, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1968, "tokens_per_second_per_gpu": 18067.33, "total_tokens": 194382567 }, { "epoch": 0.12309327331832959, "grad_norm": 0.9323428273200989, "learning_rate": 2e-05, "loss": 0.759, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1969, "tokens_per_second_per_gpu": 17371.2, "total_tokens": 194481339 }, { "epoch": 0.1231557889472368, "grad_norm": 0.9809176325798035, "learning_rate": 2e-05, "loss": 0.794, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1970, "tokens_per_second_per_gpu": 16921.5, "total_tokens": 194577773 }, { "epoch": 0.12321830457614404, "grad_norm": 0.9513590335845947, "learning_rate": 2e-05, "loss": 0.7594, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1971, "tokens_per_second_per_gpu": 17380.07, "total_tokens": 194676477 }, { "epoch": 0.12328082020505127, "grad_norm": 0.9432438015937805, "learning_rate": 2e-05, "loss": 0.7163, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1972, "tokens_per_second_per_gpu": 17254.61, "total_tokens": 194772779 }, { "epoch": 0.12334333583395848, "grad_norm": 0.9790890216827393, "learning_rate": 2e-05, "loss": 0.7183, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1973, "tokens_per_second_per_gpu": 16269.99, "total_tokens": 194870839 }, { "epoch": 0.12340585146286571, "grad_norm": 0.9220632314682007, "learning_rate": 2e-05, "loss": 0.7869, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1974, "tokens_per_second_per_gpu": 17167.77, "total_tokens": 194971887 }, { "epoch": 0.12346836709177295, "grad_norm": 0.9222463965415955, "learning_rate": 2e-05, "loss": 0.7775, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1975, "tokens_per_second_per_gpu": 17825.02, "total_tokens": 195073422 }, { "epoch": 0.12353088272068018, "grad_norm": 0.932209849357605, "learning_rate": 2e-05, "loss": 0.7974, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1976, "tokens_per_second_per_gpu": 17392.52, "total_tokens": 195176120 }, { "epoch": 0.1235933983495874, "grad_norm": 0.9073517322540283, "learning_rate": 2e-05, "loss": 0.7871, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1977, "tokens_per_second_per_gpu": 17305.2, "total_tokens": 195275625 }, { "epoch": 0.12365591397849462, "grad_norm": 0.919720470905304, "learning_rate": 2e-05, "loss": 0.7916, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1978, "tokens_per_second_per_gpu": 18486.07, "total_tokens": 195381840 }, { "epoch": 0.12371842960740186, "grad_norm": 0.9688857197761536, "learning_rate": 2e-05, "loss": 0.7566, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1979, "tokens_per_second_per_gpu": 17283.99, "total_tokens": 195477673 }, { "epoch": 0.12378094523630907, "grad_norm": 0.9566540718078613, "learning_rate": 2e-05, "loss": 0.7754, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1980, "tokens_per_second_per_gpu": 16563.25, "total_tokens": 195573669 }, { "epoch": 0.1238434608652163, "grad_norm": 0.9552305340766907, "learning_rate": 2e-05, "loss": 0.7879, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1981, "tokens_per_second_per_gpu": 17430.88, "total_tokens": 195671044 }, { "epoch": 0.12390597649412353, "grad_norm": 0.9303783178329468, "learning_rate": 2e-05, "loss": 0.7801, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1982, "tokens_per_second_per_gpu": 17199.91, "total_tokens": 195768446 }, { "epoch": 0.12396849212303075, "grad_norm": 0.9507697224617004, "learning_rate": 2e-05, "loss": 0.7776, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1983, "tokens_per_second_per_gpu": 17186.88, "total_tokens": 195865190 }, { "epoch": 0.12403100775193798, "grad_norm": 0.9730367064476013, "learning_rate": 2e-05, "loss": 0.7782, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1984, "tokens_per_second_per_gpu": 16948.38, "total_tokens": 195962720 }, { "epoch": 0.12409352338084521, "grad_norm": 0.9407357573509216, "learning_rate": 2e-05, "loss": 0.7767, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1985, "tokens_per_second_per_gpu": 17785.32, "total_tokens": 196062893 }, { "epoch": 0.12415603900975244, "grad_norm": 0.974294126033783, "learning_rate": 2e-05, "loss": 0.7643, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1986, "tokens_per_second_per_gpu": 15870.88, "total_tokens": 196154961 }, { "epoch": 0.12421855463865966, "grad_norm": 0.9123852849006653, "learning_rate": 2e-05, "loss": 0.7534, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1987, "tokens_per_second_per_gpu": 16983.81, "total_tokens": 196251516 }, { "epoch": 0.12428107026756689, "grad_norm": 2.9516005516052246, "learning_rate": 2e-05, "loss": 0.7419, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1988, "tokens_per_second_per_gpu": 17367.76, "total_tokens": 196346761 }, { "epoch": 0.12434358589647412, "grad_norm": 0.9407833814620972, "learning_rate": 2e-05, "loss": 0.7547, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1989, "tokens_per_second_per_gpu": 17526.98, "total_tokens": 196444056 }, { "epoch": 0.12440610152538134, "grad_norm": 0.9331375360488892, "learning_rate": 2e-05, "loss": 0.7688, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1990, "tokens_per_second_per_gpu": 17590.06, "total_tokens": 196542367 }, { "epoch": 0.12446861715428857, "grad_norm": 0.9183320999145508, "learning_rate": 2e-05, "loss": 0.815, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1991, "tokens_per_second_per_gpu": 17906.99, "total_tokens": 196645316 }, { "epoch": 0.1245311327831958, "grad_norm": 0.920091450214386, "learning_rate": 2e-05, "loss": 0.8009, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1992, "tokens_per_second_per_gpu": 18346.28, "total_tokens": 196748590 }, { "epoch": 0.12459364841210302, "grad_norm": 0.9070391058921814, "learning_rate": 2e-05, "loss": 0.779, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1993, "tokens_per_second_per_gpu": 18717.34, "total_tokens": 196852395 }, { "epoch": 0.12465616404101025, "grad_norm": 0.9446155428886414, "learning_rate": 2e-05, "loss": 0.767, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1994, "tokens_per_second_per_gpu": 17925.36, "total_tokens": 196953499 }, { "epoch": 0.12471867966991748, "grad_norm": 0.9130715727806091, "learning_rate": 2e-05, "loss": 0.745, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1995, "tokens_per_second_per_gpu": 16846.54, "total_tokens": 197048586 }, { "epoch": 0.12478119529882471, "grad_norm": 0.9650915861129761, "learning_rate": 2e-05, "loss": 0.8018, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1996, "tokens_per_second_per_gpu": 16761.6, "total_tokens": 197143062 }, { "epoch": 0.12484371092773193, "grad_norm": 0.9651380181312561, "learning_rate": 2e-05, "loss": 0.7768, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1997, "tokens_per_second_per_gpu": 16862.84, "total_tokens": 197241238 }, { "epoch": 0.12490622655663916, "grad_norm": 0.8905885219573975, "learning_rate": 2e-05, "loss": 0.7617, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1998, "tokens_per_second_per_gpu": 17702.28, "total_tokens": 197341936 }, { "epoch": 0.12496874218554639, "grad_norm": 0.92702716588974, "learning_rate": 2e-05, "loss": 0.7632, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 1999, "tokens_per_second_per_gpu": 17127.97, "total_tokens": 197438404 }, { "epoch": 0.12503125781445362, "grad_norm": 0.9281578660011292, "learning_rate": 2e-05, "loss": 0.7943, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2000, "tokens_per_second_per_gpu": 17091.09, "total_tokens": 197536178 }, { "epoch": 0.12509377344336084, "grad_norm": 1.0314701795578003, "learning_rate": 2e-05, "loss": 0.7433, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2001, "tokens_per_second_per_gpu": 16578.6, "total_tokens": 197627911 }, { "epoch": 0.12515628907226806, "grad_norm": 1.0356688499450684, "learning_rate": 2e-05, "loss": 0.7906, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2002, "tokens_per_second_per_gpu": 16976.41, "total_tokens": 197726369 }, { "epoch": 0.1252188047011753, "grad_norm": 0.9420995712280273, "learning_rate": 2e-05, "loss": 0.7715, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2003, "tokens_per_second_per_gpu": 17248.95, "total_tokens": 197821585 }, { "epoch": 0.12528132033008252, "grad_norm": 0.985230028629303, "learning_rate": 2e-05, "loss": 0.7013, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2004, "tokens_per_second_per_gpu": 15871.9, "total_tokens": 197913496 }, { "epoch": 0.12534383595898974, "grad_norm": 1.0104093551635742, "learning_rate": 2e-05, "loss": 0.7715, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2005, "tokens_per_second_per_gpu": 16574.45, "total_tokens": 198004339 }, { "epoch": 0.12540635158789698, "grad_norm": 0.9085330367088318, "learning_rate": 2e-05, "loss": 0.7573, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2006, "tokens_per_second_per_gpu": 17166.37, "total_tokens": 198102036 }, { "epoch": 0.1254688672168042, "grad_norm": 1.0122125148773193, "learning_rate": 2e-05, "loss": 0.7397, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2007, "tokens_per_second_per_gpu": 16396.14, "total_tokens": 198195680 }, { "epoch": 0.12553138284571141, "grad_norm": 1.0368728637695312, "learning_rate": 2e-05, "loss": 0.7916, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2008, "tokens_per_second_per_gpu": 17813.97, "total_tokens": 198292849 }, { "epoch": 0.12559389847461866, "grad_norm": 0.9183985590934753, "learning_rate": 2e-05, "loss": 0.7793, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2009, "tokens_per_second_per_gpu": 17388.22, "total_tokens": 198392386 }, { "epoch": 0.12565641410352588, "grad_norm": 1.002765417098999, "learning_rate": 2e-05, "loss": 0.7681, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2010, "tokens_per_second_per_gpu": 16826.41, "total_tokens": 198490619 }, { "epoch": 0.12571892973243312, "grad_norm": 0.9739038348197937, "learning_rate": 2e-05, "loss": 0.8178, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2011, "tokens_per_second_per_gpu": 17339.79, "total_tokens": 198589867 }, { "epoch": 0.12578144536134034, "grad_norm": 0.9393935203552246, "learning_rate": 2e-05, "loss": 0.7738, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2012, "tokens_per_second_per_gpu": 16258.43, "total_tokens": 198687682 }, { "epoch": 0.12584396099024756, "grad_norm": 0.9151747226715088, "learning_rate": 2e-05, "loss": 0.7499, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2013, "tokens_per_second_per_gpu": 17461.25, "total_tokens": 198787072 }, { "epoch": 0.1259064766191548, "grad_norm": 0.958083987236023, "learning_rate": 2e-05, "loss": 0.7519, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2014, "tokens_per_second_per_gpu": 16626.52, "total_tokens": 198884395 }, { "epoch": 0.12596899224806202, "grad_norm": 0.9182729125022888, "learning_rate": 2e-05, "loss": 0.7639, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2015, "tokens_per_second_per_gpu": 17851.21, "total_tokens": 198986527 }, { "epoch": 0.12603150787696923, "grad_norm": 0.9443677067756653, "learning_rate": 2e-05, "loss": 0.744, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2016, "tokens_per_second_per_gpu": 16590.12, "total_tokens": 199082460 }, { "epoch": 0.12609402350587648, "grad_norm": 0.9859200119972229, "learning_rate": 2e-05, "loss": 0.7753, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2017, "tokens_per_second_per_gpu": 16964.03, "total_tokens": 199180779 }, { "epoch": 0.1261565391347837, "grad_norm": 0.9465273022651672, "learning_rate": 2e-05, "loss": 0.7902, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2018, "tokens_per_second_per_gpu": 18549.16, "total_tokens": 199283143 }, { "epoch": 0.1262190547636909, "grad_norm": 0.97205650806427, "learning_rate": 2e-05, "loss": 0.7518, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2019, "tokens_per_second_per_gpu": 16989.81, "total_tokens": 199377326 }, { "epoch": 0.12628157039259816, "grad_norm": 0.9801228046417236, "learning_rate": 2e-05, "loss": 0.7312, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2020, "tokens_per_second_per_gpu": 16029.17, "total_tokens": 199465017 }, { "epoch": 0.12634408602150538, "grad_norm": 0.9750827550888062, "learning_rate": 2e-05, "loss": 0.7851, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2021, "tokens_per_second_per_gpu": 17345.21, "total_tokens": 199564978 }, { "epoch": 0.1264066016504126, "grad_norm": 0.9839730858802795, "learning_rate": 2e-05, "loss": 0.7624, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2022, "tokens_per_second_per_gpu": 16003.44, "total_tokens": 199660092 }, { "epoch": 0.12646911727931984, "grad_norm": 1.1127375364303589, "learning_rate": 2e-05, "loss": 0.7627, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2023, "tokens_per_second_per_gpu": 15712.29, "total_tokens": 199751304 }, { "epoch": 0.12653163290822705, "grad_norm": 0.951545238494873, "learning_rate": 2e-05, "loss": 0.799, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2024, "tokens_per_second_per_gpu": 17194.08, "total_tokens": 199850131 }, { "epoch": 0.12659414853713427, "grad_norm": 1.0051980018615723, "learning_rate": 2e-05, "loss": 0.7795, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2025, "tokens_per_second_per_gpu": 17079.09, "total_tokens": 199946443 }, { "epoch": 0.12665666416604152, "grad_norm": 0.9808390736579895, "learning_rate": 2e-05, "loss": 0.7762, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2026, "tokens_per_second_per_gpu": 18398.71, "total_tokens": 200051021 }, { "epoch": 0.12671917979494873, "grad_norm": 0.9607844948768616, "learning_rate": 2e-05, "loss": 0.7462, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2027, "tokens_per_second_per_gpu": 15939.14, "total_tokens": 200145246 }, { "epoch": 0.12678169542385595, "grad_norm": 0.9601980447769165, "learning_rate": 2e-05, "loss": 0.7778, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2028, "tokens_per_second_per_gpu": 16765.94, "total_tokens": 200240881 }, { "epoch": 0.1268442110527632, "grad_norm": 0.9636896848678589, "learning_rate": 2e-05, "loss": 0.8476, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2029, "tokens_per_second_per_gpu": 18911.1, "total_tokens": 200344129 }, { "epoch": 0.1269067266816704, "grad_norm": 0.999422013759613, "learning_rate": 2e-05, "loss": 0.7798, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2030, "tokens_per_second_per_gpu": 17203.42, "total_tokens": 200441791 }, { "epoch": 0.12696924231057766, "grad_norm": 1.0529823303222656, "learning_rate": 2e-05, "loss": 0.8147, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2031, "tokens_per_second_per_gpu": 17212.2, "total_tokens": 200539290 }, { "epoch": 0.12703175793948487, "grad_norm": 0.9489736557006836, "learning_rate": 2e-05, "loss": 0.7643, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2032, "tokens_per_second_per_gpu": 17253.6, "total_tokens": 200635560 }, { "epoch": 0.1270942735683921, "grad_norm": 0.9875903129577637, "learning_rate": 2e-05, "loss": 0.7449, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2033, "tokens_per_second_per_gpu": 17026.35, "total_tokens": 200733371 }, { "epoch": 0.12715678919729934, "grad_norm": 0.9474966526031494, "learning_rate": 2e-05, "loss": 0.763, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2034, "tokens_per_second_per_gpu": 16710.89, "total_tokens": 200830095 }, { "epoch": 0.12721930482620655, "grad_norm": 0.915963888168335, "learning_rate": 2e-05, "loss": 0.7583, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2035, "tokens_per_second_per_gpu": 15459.73, "total_tokens": 200924162 }, { "epoch": 0.12728182045511377, "grad_norm": 0.9416934847831726, "learning_rate": 2e-05, "loss": 0.7658, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2036, "tokens_per_second_per_gpu": 17097.3, "total_tokens": 201022001 }, { "epoch": 0.12734433608402101, "grad_norm": 0.9904340505599976, "learning_rate": 2e-05, "loss": 0.7472, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2037, "tokens_per_second_per_gpu": 16568.9, "total_tokens": 201116945 }, { "epoch": 0.12740685171292823, "grad_norm": 0.9358832836151123, "learning_rate": 2e-05, "loss": 0.7578, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2038, "tokens_per_second_per_gpu": 18014.85, "total_tokens": 201216318 }, { "epoch": 0.12746936734183545, "grad_norm": 0.958410918712616, "learning_rate": 2e-05, "loss": 0.6776, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2039, "tokens_per_second_per_gpu": 15535.19, "total_tokens": 201305781 }, { "epoch": 0.1275318829707427, "grad_norm": 0.9349533915519714, "learning_rate": 2e-05, "loss": 0.8039, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2040, "tokens_per_second_per_gpu": 17800.0, "total_tokens": 201405450 }, { "epoch": 0.1275943985996499, "grad_norm": 0.9877825379371643, "learning_rate": 2e-05, "loss": 0.7446, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2041, "tokens_per_second_per_gpu": 17187.77, "total_tokens": 201500213 }, { "epoch": 0.12765691422855713, "grad_norm": 0.9115051031112671, "learning_rate": 2e-05, "loss": 0.7728, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2042, "tokens_per_second_per_gpu": 17608.41, "total_tokens": 201600843 }, { "epoch": 0.12771942985746437, "grad_norm": 0.9280492663383484, "learning_rate": 2e-05, "loss": 0.7702, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2043, "tokens_per_second_per_gpu": 16910.43, "total_tokens": 201697472 }, { "epoch": 0.1277819454863716, "grad_norm": 0.9322593808174133, "learning_rate": 2e-05, "loss": 0.8116, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2044, "tokens_per_second_per_gpu": 17929.29, "total_tokens": 201796562 }, { "epoch": 0.1278444611152788, "grad_norm": 0.921146810054779, "learning_rate": 2e-05, "loss": 0.7395, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2045, "tokens_per_second_per_gpu": 16992.72, "total_tokens": 201895968 }, { "epoch": 0.12790697674418605, "grad_norm": 0.938460111618042, "learning_rate": 2e-05, "loss": 0.7719, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2046, "tokens_per_second_per_gpu": 17320.41, "total_tokens": 201993319 }, { "epoch": 0.12796949237309327, "grad_norm": 0.9249254465103149, "learning_rate": 2e-05, "loss": 0.7567, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2047, "tokens_per_second_per_gpu": 16556.93, "total_tokens": 202089986 }, { "epoch": 0.1280320080020005, "grad_norm": 0.9747976064682007, "learning_rate": 2e-05, "loss": 0.814, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2048, "tokens_per_second_per_gpu": 16106.45, "total_tokens": 202180172 }, { "epoch": 0.12809452363090773, "grad_norm": 0.9197193384170532, "learning_rate": 2e-05, "loss": 0.7497, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2049, "tokens_per_second_per_gpu": 17300.35, "total_tokens": 202276793 }, { "epoch": 0.12815703925981495, "grad_norm": 0.9510537385940552, "learning_rate": 2e-05, "loss": 0.7493, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2050, "tokens_per_second_per_gpu": 17174.93, "total_tokens": 202371502 }, { "epoch": 0.1282195548887222, "grad_norm": 0.9104939103126526, "learning_rate": 2e-05, "loss": 0.7898, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2051, "tokens_per_second_per_gpu": 18450.03, "total_tokens": 202474405 }, { "epoch": 0.1282820705176294, "grad_norm": 0.967218816280365, "learning_rate": 2e-05, "loss": 0.7454, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2052, "tokens_per_second_per_gpu": 17700.0, "total_tokens": 202570538 }, { "epoch": 0.12834458614653663, "grad_norm": 0.9521265625953674, "learning_rate": 2e-05, "loss": 0.7916, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2053, "tokens_per_second_per_gpu": 17271.66, "total_tokens": 202667457 }, { "epoch": 0.12840710177544387, "grad_norm": 0.9604955911636353, "learning_rate": 2e-05, "loss": 0.7892, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2054, "tokens_per_second_per_gpu": 17792.75, "total_tokens": 202768176 }, { "epoch": 0.1284696174043511, "grad_norm": 0.9467065334320068, "learning_rate": 2e-05, "loss": 0.7479, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2055, "tokens_per_second_per_gpu": 17411.05, "total_tokens": 202867997 }, { "epoch": 0.1285321330332583, "grad_norm": 0.9734957814216614, "learning_rate": 2e-05, "loss": 0.7248, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2056, "tokens_per_second_per_gpu": 17463.12, "total_tokens": 202960329 }, { "epoch": 0.12859464866216555, "grad_norm": 0.9548068642616272, "learning_rate": 2e-05, "loss": 0.7452, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2057, "tokens_per_second_per_gpu": 17007.12, "total_tokens": 203057514 }, { "epoch": 0.12865716429107277, "grad_norm": 0.9127984642982483, "learning_rate": 2e-05, "loss": 0.7547, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2058, "tokens_per_second_per_gpu": 17180.46, "total_tokens": 203156633 }, { "epoch": 0.12871967991997998, "grad_norm": 0.9410499334335327, "learning_rate": 2e-05, "loss": 0.7706, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2059, "tokens_per_second_per_gpu": 16514.95, "total_tokens": 203250823 }, { "epoch": 0.12878219554888723, "grad_norm": 0.9285303354263306, "learning_rate": 2e-05, "loss": 0.7418, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2060, "tokens_per_second_per_gpu": 17032.75, "total_tokens": 203345881 }, { "epoch": 0.12884471117779445, "grad_norm": 0.8962650299072266, "learning_rate": 2e-05, "loss": 0.7572, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2061, "tokens_per_second_per_gpu": 16547.77, "total_tokens": 203441747 }, { "epoch": 0.12890722680670166, "grad_norm": 0.9049785733222961, "learning_rate": 2e-05, "loss": 0.7529, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2062, "tokens_per_second_per_gpu": 17871.52, "total_tokens": 203540779 }, { "epoch": 0.1289697424356089, "grad_norm": 0.8817489147186279, "learning_rate": 2e-05, "loss": 0.6626, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2063, "tokens_per_second_per_gpu": 16727.59, "total_tokens": 203632071 }, { "epoch": 0.12903225806451613, "grad_norm": 0.9572360515594482, "learning_rate": 2e-05, "loss": 0.738, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2064, "tokens_per_second_per_gpu": 15599.17, "total_tokens": 203724626 }, { "epoch": 0.12909477369342334, "grad_norm": 0.9793360233306885, "learning_rate": 2e-05, "loss": 0.7428, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2065, "tokens_per_second_per_gpu": 17570.81, "total_tokens": 203821497 }, { "epoch": 0.1291572893223306, "grad_norm": 0.9524911642074585, "learning_rate": 2e-05, "loss": 0.7519, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2066, "tokens_per_second_per_gpu": 17192.08, "total_tokens": 203919467 }, { "epoch": 0.1292198049512378, "grad_norm": 0.92225581407547, "learning_rate": 2e-05, "loss": 0.7458, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2067, "tokens_per_second_per_gpu": 16700.1, "total_tokens": 204015257 }, { "epoch": 0.12928232058014505, "grad_norm": 1.0131402015686035, "learning_rate": 2e-05, "loss": 0.7739, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2068, "tokens_per_second_per_gpu": 16700.36, "total_tokens": 204110916 }, { "epoch": 0.12934483620905227, "grad_norm": 0.9069336652755737, "learning_rate": 2e-05, "loss": 0.72, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2069, "tokens_per_second_per_gpu": 17225.16, "total_tokens": 204209431 }, { "epoch": 0.12940735183795948, "grad_norm": 0.9523788094520569, "learning_rate": 2e-05, "loss": 0.7991, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2070, "tokens_per_second_per_gpu": 16853.23, "total_tokens": 204306816 }, { "epoch": 0.12946986746686673, "grad_norm": 0.9236236214637756, "learning_rate": 2e-05, "loss": 0.7677, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2071, "tokens_per_second_per_gpu": 16969.88, "total_tokens": 204404368 }, { "epoch": 0.12953238309577395, "grad_norm": 0.9344677329063416, "learning_rate": 2e-05, "loss": 0.7869, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2072, "tokens_per_second_per_gpu": 16921.4, "total_tokens": 204503547 }, { "epoch": 0.12959489872468116, "grad_norm": 0.9251070022583008, "learning_rate": 2e-05, "loss": 0.7423, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2073, "tokens_per_second_per_gpu": 15795.67, "total_tokens": 204597492 }, { "epoch": 0.1296574143535884, "grad_norm": 0.9323503375053406, "learning_rate": 2e-05, "loss": 0.773, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2074, "tokens_per_second_per_gpu": 16914.1, "total_tokens": 204695400 }, { "epoch": 0.12971992998249562, "grad_norm": 0.94776850938797, "learning_rate": 2e-05, "loss": 0.763, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2075, "tokens_per_second_per_gpu": 16202.82, "total_tokens": 204788938 }, { "epoch": 0.12978244561140284, "grad_norm": 0.9451532363891602, "learning_rate": 2e-05, "loss": 0.8004, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2076, "tokens_per_second_per_gpu": 16238.13, "total_tokens": 204881957 }, { "epoch": 0.1298449612403101, "grad_norm": 0.913089394569397, "learning_rate": 2e-05, "loss": 0.7497, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2077, "tokens_per_second_per_gpu": 17591.64, "total_tokens": 204981631 }, { "epoch": 0.1299074768692173, "grad_norm": 0.9476186037063599, "learning_rate": 2e-05, "loss": 0.7567, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2078, "tokens_per_second_per_gpu": 16755.57, "total_tokens": 205078630 }, { "epoch": 0.12996999249812452, "grad_norm": 1.4235743284225464, "learning_rate": 2e-05, "loss": 0.7606, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2079, "tokens_per_second_per_gpu": 16787.8, "total_tokens": 205176489 }, { "epoch": 0.13003250812703177, "grad_norm": 0.9838753342628479, "learning_rate": 2e-05, "loss": 0.7443, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2080, "tokens_per_second_per_gpu": 17364.86, "total_tokens": 205269445 }, { "epoch": 0.13009502375593898, "grad_norm": 0.94709312915802, "learning_rate": 2e-05, "loss": 0.7536, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2081, "tokens_per_second_per_gpu": 17460.62, "total_tokens": 205368390 }, { "epoch": 0.1301575393848462, "grad_norm": 0.9872872829437256, "learning_rate": 2e-05, "loss": 0.7905, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2082, "tokens_per_second_per_gpu": 16335.0, "total_tokens": 205464130 }, { "epoch": 0.13022005501375344, "grad_norm": 0.9643244743347168, "learning_rate": 2e-05, "loss": 0.7487, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2083, "tokens_per_second_per_gpu": 16864.64, "total_tokens": 205561784 }, { "epoch": 0.13028257064266066, "grad_norm": 1.051395297050476, "learning_rate": 2e-05, "loss": 0.8004, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2084, "tokens_per_second_per_gpu": 17985.16, "total_tokens": 205660215 }, { "epoch": 0.13034508627156788, "grad_norm": 0.9856132864952087, "learning_rate": 2e-05, "loss": 0.7676, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2085, "tokens_per_second_per_gpu": 16957.11, "total_tokens": 205758953 }, { "epoch": 0.13040760190047512, "grad_norm": 0.9891558289527893, "learning_rate": 2e-05, "loss": 0.7384, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2086, "tokens_per_second_per_gpu": 15194.24, "total_tokens": 205855607 }, { "epoch": 0.13047011752938234, "grad_norm": 0.9593833088874817, "learning_rate": 2e-05, "loss": 0.7938, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2087, "tokens_per_second_per_gpu": 17251.2, "total_tokens": 205952598 }, { "epoch": 0.13053263315828958, "grad_norm": 0.9782513976097107, "learning_rate": 2e-05, "loss": 0.7383, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2088, "tokens_per_second_per_gpu": 16212.94, "total_tokens": 206044880 }, { "epoch": 0.1305951487871968, "grad_norm": 0.9681912660598755, "learning_rate": 2e-05, "loss": 0.8047, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2089, "tokens_per_second_per_gpu": 17679.32, "total_tokens": 206141444 }, { "epoch": 0.13065766441610402, "grad_norm": 1.0044914484024048, "learning_rate": 2e-05, "loss": 0.7162, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2090, "tokens_per_second_per_gpu": 15668.63, "total_tokens": 206230724 }, { "epoch": 0.13072018004501126, "grad_norm": 1.0670229196548462, "learning_rate": 2e-05, "loss": 0.7681, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2091, "tokens_per_second_per_gpu": 17338.56, "total_tokens": 206331083 }, { "epoch": 0.13078269567391848, "grad_norm": 0.9126270413398743, "learning_rate": 2e-05, "loss": 0.7483, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2092, "tokens_per_second_per_gpu": 18500.56, "total_tokens": 206431297 }, { "epoch": 0.1308452113028257, "grad_norm": 0.9168487787246704, "learning_rate": 2e-05, "loss": 0.7834, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2093, "tokens_per_second_per_gpu": 17980.56, "total_tokens": 206535448 }, { "epoch": 0.13090772693173294, "grad_norm": 0.967918872833252, "learning_rate": 2e-05, "loss": 0.7466, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2094, "tokens_per_second_per_gpu": 17369.39, "total_tokens": 206638500 }, { "epoch": 0.13097024256064016, "grad_norm": 0.936130166053772, "learning_rate": 2e-05, "loss": 0.7925, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2095, "tokens_per_second_per_gpu": 17110.9, "total_tokens": 206737379 }, { "epoch": 0.13103275818954738, "grad_norm": 0.9334095120429993, "learning_rate": 2e-05, "loss": 0.7706, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2096, "tokens_per_second_per_gpu": 17890.58, "total_tokens": 206836742 }, { "epoch": 0.13109527381845462, "grad_norm": 0.934170663356781, "learning_rate": 2e-05, "loss": 0.7851, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2097, "tokens_per_second_per_gpu": 17543.2, "total_tokens": 206935603 }, { "epoch": 0.13115778944736184, "grad_norm": 0.9417128562927246, "learning_rate": 2e-05, "loss": 0.7422, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2098, "tokens_per_second_per_gpu": 18054.67, "total_tokens": 207037206 }, { "epoch": 0.13122030507626906, "grad_norm": 0.9326011538505554, "learning_rate": 2e-05, "loss": 0.7536, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2099, "tokens_per_second_per_gpu": 18746.27, "total_tokens": 207136383 }, { "epoch": 0.1312828207051763, "grad_norm": 0.9297819137573242, "learning_rate": 2e-05, "loss": 0.762, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2100, "tokens_per_second_per_gpu": 18327.66, "total_tokens": 207240578 }, { "epoch": 0.13134533633408352, "grad_norm": 1.008277177810669, "learning_rate": 2e-05, "loss": 0.7563, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2101, "tokens_per_second_per_gpu": 17120.08, "total_tokens": 207340754 }, { "epoch": 0.13140785196299073, "grad_norm": 1.013741374015808, "learning_rate": 2e-05, "loss": 0.8383, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2102, "tokens_per_second_per_gpu": 18667.38, "total_tokens": 207442904 }, { "epoch": 0.13147036759189798, "grad_norm": 0.9071624875068665, "learning_rate": 2e-05, "loss": 0.7585, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2103, "tokens_per_second_per_gpu": 17675.29, "total_tokens": 207546846 }, { "epoch": 0.1315328832208052, "grad_norm": 0.9820895195007324, "learning_rate": 2e-05, "loss": 0.7728, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2104, "tokens_per_second_per_gpu": 17342.15, "total_tokens": 207644218 }, { "epoch": 0.13159539884971244, "grad_norm": 0.9628484845161438, "learning_rate": 2e-05, "loss": 0.7404, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2105, "tokens_per_second_per_gpu": 16289.11, "total_tokens": 207741019 }, { "epoch": 0.13165791447861966, "grad_norm": 0.8912862539291382, "learning_rate": 2e-05, "loss": 0.7597, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2106, "tokens_per_second_per_gpu": 17479.99, "total_tokens": 207840307 }, { "epoch": 0.13172043010752688, "grad_norm": 0.9885125756263733, "learning_rate": 2e-05, "loss": 0.7282, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2107, "tokens_per_second_per_gpu": 16598.69, "total_tokens": 207933851 }, { "epoch": 0.13178294573643412, "grad_norm": 0.9867501258850098, "learning_rate": 2e-05, "loss": 0.7808, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2108, "tokens_per_second_per_gpu": 17872.82, "total_tokens": 208035360 }, { "epoch": 0.13184546136534134, "grad_norm": 0.9157252907752991, "learning_rate": 2e-05, "loss": 0.7541, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2109, "tokens_per_second_per_gpu": 17873.26, "total_tokens": 208135865 }, { "epoch": 0.13190797699424855, "grad_norm": 0.9029736518859863, "learning_rate": 2e-05, "loss": 0.722, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2110, "tokens_per_second_per_gpu": 17534.9, "total_tokens": 208234318 }, { "epoch": 0.1319704926231558, "grad_norm": 0.9101777672767639, "learning_rate": 2e-05, "loss": 0.7539, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2111, "tokens_per_second_per_gpu": 17029.59, "total_tokens": 208330655 }, { "epoch": 0.13203300825206302, "grad_norm": 0.9895868301391602, "learning_rate": 2e-05, "loss": 0.7508, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2112, "tokens_per_second_per_gpu": 17440.42, "total_tokens": 208430056 }, { "epoch": 0.13209552388097023, "grad_norm": 0.9462359547615051, "learning_rate": 2e-05, "loss": 0.7457, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2113, "tokens_per_second_per_gpu": 17313.92, "total_tokens": 208525222 }, { "epoch": 0.13215803950987748, "grad_norm": 0.9177573919296265, "learning_rate": 2e-05, "loss": 0.7814, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2114, "tokens_per_second_per_gpu": 18192.01, "total_tokens": 208627593 }, { "epoch": 0.1322205551387847, "grad_norm": 0.9180757403373718, "learning_rate": 2e-05, "loss": 0.7587, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2115, "tokens_per_second_per_gpu": 17667.36, "total_tokens": 208725254 }, { "epoch": 0.1322830707676919, "grad_norm": 0.9265543818473816, "learning_rate": 2e-05, "loss": 0.7279, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2116, "tokens_per_second_per_gpu": 17563.53, "total_tokens": 208822811 }, { "epoch": 0.13234558639659916, "grad_norm": 0.9181976914405823, "learning_rate": 2e-05, "loss": 0.7731, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2117, "tokens_per_second_per_gpu": 17255.59, "total_tokens": 208923339 }, { "epoch": 0.13240810202550637, "grad_norm": 0.9496869444847107, "learning_rate": 2e-05, "loss": 0.8138, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2118, "tokens_per_second_per_gpu": 18681.36, "total_tokens": 209028869 }, { "epoch": 0.1324706176544136, "grad_norm": 0.9969913959503174, "learning_rate": 2e-05, "loss": 0.7178, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2119, "tokens_per_second_per_gpu": 16520.25, "total_tokens": 209125115 }, { "epoch": 0.13253313328332084, "grad_norm": 0.9176998734474182, "learning_rate": 2e-05, "loss": 0.7662, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2120, "tokens_per_second_per_gpu": 17863.24, "total_tokens": 209224568 }, { "epoch": 0.13259564891222805, "grad_norm": 0.9589657783508301, "learning_rate": 2e-05, "loss": 0.7974, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2121, "tokens_per_second_per_gpu": 18685.89, "total_tokens": 209328036 }, { "epoch": 0.13265816454113527, "grad_norm": 0.9712933897972107, "learning_rate": 2e-05, "loss": 0.7129, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2122, "tokens_per_second_per_gpu": 17685.17, "total_tokens": 209424814 }, { "epoch": 0.13272068017004252, "grad_norm": 0.9065782427787781, "learning_rate": 2e-05, "loss": 0.7082, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2123, "tokens_per_second_per_gpu": 17621.2, "total_tokens": 209521986 }, { "epoch": 0.13278319579894973, "grad_norm": 0.9306748509407043, "learning_rate": 2e-05, "loss": 0.7772, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2124, "tokens_per_second_per_gpu": 16891.24, "total_tokens": 209620351 }, { "epoch": 0.13284571142785698, "grad_norm": 0.9400573968887329, "learning_rate": 2e-05, "loss": 0.6819, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2125, "tokens_per_second_per_gpu": 16353.31, "total_tokens": 209713252 }, { "epoch": 0.1329082270567642, "grad_norm": 1.000241756439209, "learning_rate": 2e-05, "loss": 0.7489, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2126, "tokens_per_second_per_gpu": 18068.56, "total_tokens": 209814483 }, { "epoch": 0.1329707426856714, "grad_norm": 0.9217177629470825, "learning_rate": 2e-05, "loss": 0.7251, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2127, "tokens_per_second_per_gpu": 16954.73, "total_tokens": 209914868 }, { "epoch": 0.13303325831457866, "grad_norm": 0.9439372420310974, "learning_rate": 2e-05, "loss": 0.7749, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2128, "tokens_per_second_per_gpu": 16670.61, "total_tokens": 210009702 }, { "epoch": 0.13309577394348587, "grad_norm": 1.0002511739730835, "learning_rate": 2e-05, "loss": 0.7655, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2129, "tokens_per_second_per_gpu": 17615.1, "total_tokens": 210110642 }, { "epoch": 0.1331582895723931, "grad_norm": 0.9271626472473145, "learning_rate": 2e-05, "loss": 0.7224, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2130, "tokens_per_second_per_gpu": 16968.76, "total_tokens": 210207646 }, { "epoch": 0.13322080520130034, "grad_norm": 0.9074277877807617, "learning_rate": 2e-05, "loss": 0.7368, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2131, "tokens_per_second_per_gpu": 17278.57, "total_tokens": 210308329 }, { "epoch": 0.13328332083020755, "grad_norm": 0.9241006374359131, "learning_rate": 2e-05, "loss": 0.8034, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2132, "tokens_per_second_per_gpu": 17866.84, "total_tokens": 210411120 }, { "epoch": 0.13334583645911477, "grad_norm": 0.9883346557617188, "learning_rate": 2e-05, "loss": 0.7644, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2133, "tokens_per_second_per_gpu": 17550.31, "total_tokens": 210507477 }, { "epoch": 0.13340835208802201, "grad_norm": 0.9462481737136841, "learning_rate": 2e-05, "loss": 0.7547, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2134, "tokens_per_second_per_gpu": 16477.43, "total_tokens": 210603541 }, { "epoch": 0.13347086771692923, "grad_norm": 0.9258906841278076, "learning_rate": 2e-05, "loss": 0.6975, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2135, "tokens_per_second_per_gpu": 17131.13, "total_tokens": 210701265 }, { "epoch": 0.13353338334583645, "grad_norm": 0.9051150679588318, "learning_rate": 2e-05, "loss": 0.7764, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2136, "tokens_per_second_per_gpu": 18261.55, "total_tokens": 210802126 }, { "epoch": 0.1335958989747437, "grad_norm": 0.9174699187278748, "learning_rate": 2e-05, "loss": 0.7749, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2137, "tokens_per_second_per_gpu": 17157.88, "total_tokens": 210904407 }, { "epoch": 0.1336584146036509, "grad_norm": 0.9791237115859985, "learning_rate": 2e-05, "loss": 0.8045, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2138, "tokens_per_second_per_gpu": 17514.55, "total_tokens": 211003190 }, { "epoch": 0.13372093023255813, "grad_norm": 0.9562098979949951, "learning_rate": 2e-05, "loss": 0.7331, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2139, "tokens_per_second_per_gpu": 17930.32, "total_tokens": 211102013 }, { "epoch": 0.13378344586146537, "grad_norm": 0.9237320423126221, "learning_rate": 2e-05, "loss": 0.7549, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2140, "tokens_per_second_per_gpu": 16682.44, "total_tokens": 211203093 }, { "epoch": 0.1338459614903726, "grad_norm": 0.9059449434280396, "learning_rate": 2e-05, "loss": 0.7729, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2141, "tokens_per_second_per_gpu": 17984.08, "total_tokens": 211304391 }, { "epoch": 0.1339084771192798, "grad_norm": 0.9132794737815857, "learning_rate": 2e-05, "loss": 0.7848, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2142, "tokens_per_second_per_gpu": 17875.41, "total_tokens": 211408261 }, { "epoch": 0.13397099274818705, "grad_norm": 0.9789265394210815, "learning_rate": 2e-05, "loss": 0.7565, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2143, "tokens_per_second_per_gpu": 17916.04, "total_tokens": 211506523 }, { "epoch": 0.13403350837709427, "grad_norm": 0.9660510420799255, "learning_rate": 2e-05, "loss": 0.7335, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2144, "tokens_per_second_per_gpu": 17487.83, "total_tokens": 211605870 }, { "epoch": 0.1340960240060015, "grad_norm": 0.9982309341430664, "learning_rate": 2e-05, "loss": 0.7855, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2145, "tokens_per_second_per_gpu": 17063.39, "total_tokens": 211706156 }, { "epoch": 0.13415853963490873, "grad_norm": 0.9367470741271973, "learning_rate": 2e-05, "loss": 0.7413, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2146, "tokens_per_second_per_gpu": 17651.78, "total_tokens": 211808966 }, { "epoch": 0.13422105526381595, "grad_norm": 0.8913178443908691, "learning_rate": 2e-05, "loss": 0.7676, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2147, "tokens_per_second_per_gpu": 17704.92, "total_tokens": 211912454 }, { "epoch": 0.1342835708927232, "grad_norm": 0.9112251400947571, "learning_rate": 2e-05, "loss": 0.7277, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2148, "tokens_per_second_per_gpu": 18305.59, "total_tokens": 212011551 }, { "epoch": 0.1343460865216304, "grad_norm": 0.9462031722068787, "learning_rate": 2e-05, "loss": 0.7191, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2149, "tokens_per_second_per_gpu": 16780.82, "total_tokens": 212110274 }, { "epoch": 0.13440860215053763, "grad_norm": 0.934347927570343, "learning_rate": 2e-05, "loss": 0.7649, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2150, "tokens_per_second_per_gpu": 17017.96, "total_tokens": 212208613 }, { "epoch": 0.13447111777944487, "grad_norm": 0.9302058219909668, "learning_rate": 2e-05, "loss": 0.7509, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2151, "tokens_per_second_per_gpu": 16991.4, "total_tokens": 212303919 }, { "epoch": 0.1345336334083521, "grad_norm": 0.9700202345848083, "learning_rate": 2e-05, "loss": 0.7522, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2152, "tokens_per_second_per_gpu": 17302.18, "total_tokens": 212399767 }, { "epoch": 0.1345961490372593, "grad_norm": 0.8995731472969055, "learning_rate": 2e-05, "loss": 0.7278, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2153, "tokens_per_second_per_gpu": 18414.69, "total_tokens": 212498723 }, { "epoch": 0.13465866466616655, "grad_norm": 0.9020694494247437, "learning_rate": 2e-05, "loss": 0.7375, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2154, "tokens_per_second_per_gpu": 18160.21, "total_tokens": 212598757 }, { "epoch": 0.13472118029507377, "grad_norm": 0.9454078674316406, "learning_rate": 2e-05, "loss": 0.7521, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2155, "tokens_per_second_per_gpu": 17241.47, "total_tokens": 212697198 }, { "epoch": 0.13478369592398098, "grad_norm": 0.977353572845459, "learning_rate": 2e-05, "loss": 0.776, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2156, "tokens_per_second_per_gpu": 17311.27, "total_tokens": 212796878 }, { "epoch": 0.13484621155288823, "grad_norm": 0.984397292137146, "learning_rate": 2e-05, "loss": 0.7278, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2157, "tokens_per_second_per_gpu": 16287.93, "total_tokens": 212889286 }, { "epoch": 0.13490872718179545, "grad_norm": 0.967250406742096, "learning_rate": 2e-05, "loss": 0.7651, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2158, "tokens_per_second_per_gpu": 17741.9, "total_tokens": 212984417 }, { "epoch": 0.13497124281070266, "grad_norm": 0.9608093500137329, "learning_rate": 2e-05, "loss": 0.7742, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2159, "tokens_per_second_per_gpu": 16992.2, "total_tokens": 213079508 }, { "epoch": 0.1350337584396099, "grad_norm": 0.9308426976203918, "learning_rate": 2e-05, "loss": 0.7903, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2160, "tokens_per_second_per_gpu": 18514.25, "total_tokens": 213179381 }, { "epoch": 0.13509627406851712, "grad_norm": 0.9419776201248169, "learning_rate": 2e-05, "loss": 0.7363, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2161, "tokens_per_second_per_gpu": 17268.32, "total_tokens": 213277897 }, { "epoch": 0.13515878969742437, "grad_norm": 0.9781303405761719, "learning_rate": 2e-05, "loss": 0.7422, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2162, "tokens_per_second_per_gpu": 17744.91, "total_tokens": 213376323 }, { "epoch": 0.1352213053263316, "grad_norm": 0.8813769221305847, "learning_rate": 2e-05, "loss": 0.7227, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2163, "tokens_per_second_per_gpu": 18021.14, "total_tokens": 213478242 }, { "epoch": 0.1352838209552388, "grad_norm": 0.8814067244529724, "learning_rate": 2e-05, "loss": 0.7133, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2164, "tokens_per_second_per_gpu": 18354.62, "total_tokens": 213581921 }, { "epoch": 0.13534633658414605, "grad_norm": 0.922791600227356, "learning_rate": 2e-05, "loss": 0.7427, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2165, "tokens_per_second_per_gpu": 16979.28, "total_tokens": 213679042 }, { "epoch": 0.13540885221305327, "grad_norm": 0.9511463642120361, "learning_rate": 2e-05, "loss": 0.784, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2166, "tokens_per_second_per_gpu": 16777.94, "total_tokens": 213777890 }, { "epoch": 0.13547136784196048, "grad_norm": 0.9500851631164551, "learning_rate": 2e-05, "loss": 0.7588, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2167, "tokens_per_second_per_gpu": 17062.28, "total_tokens": 213874581 }, { "epoch": 0.13553388347086773, "grad_norm": 0.9037838578224182, "learning_rate": 2e-05, "loss": 0.743, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2168, "tokens_per_second_per_gpu": 18601.0, "total_tokens": 213977641 }, { "epoch": 0.13559639909977494, "grad_norm": 0.9136397838592529, "learning_rate": 2e-05, "loss": 0.7327, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2169, "tokens_per_second_per_gpu": 17375.94, "total_tokens": 214077449 }, { "epoch": 0.13565891472868216, "grad_norm": 1.0284371376037598, "learning_rate": 2e-05, "loss": 0.793, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2170, "tokens_per_second_per_gpu": 17316.32, "total_tokens": 214178785 }, { "epoch": 0.1357214303575894, "grad_norm": 0.9285328388214111, "learning_rate": 2e-05, "loss": 0.7399, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2171, "tokens_per_second_per_gpu": 17083.77, "total_tokens": 214280988 }, { "epoch": 0.13578394598649662, "grad_norm": 1.0033433437347412, "learning_rate": 2e-05, "loss": 0.7213, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2172, "tokens_per_second_per_gpu": 16407.86, "total_tokens": 214374819 }, { "epoch": 0.13584646161540384, "grad_norm": 0.9982866644859314, "learning_rate": 2e-05, "loss": 0.7358, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2173, "tokens_per_second_per_gpu": 16870.78, "total_tokens": 214470582 }, { "epoch": 0.13590897724431109, "grad_norm": 0.9234154224395752, "learning_rate": 2e-05, "loss": 0.7597, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2174, "tokens_per_second_per_gpu": 17734.24, "total_tokens": 214571644 }, { "epoch": 0.1359714928732183, "grad_norm": 0.9434897899627686, "learning_rate": 2e-05, "loss": 0.772, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2175, "tokens_per_second_per_gpu": 17332.03, "total_tokens": 214672293 }, { "epoch": 0.13603400850212552, "grad_norm": 0.953635036945343, "learning_rate": 2e-05, "loss": 0.7176, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2176, "tokens_per_second_per_gpu": 17095.57, "total_tokens": 214770390 }, { "epoch": 0.13609652413103276, "grad_norm": 0.9346063733100891, "learning_rate": 2e-05, "loss": 0.7655, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2177, "tokens_per_second_per_gpu": 18297.52, "total_tokens": 214872569 }, { "epoch": 0.13615903975993998, "grad_norm": 0.9134714603424072, "learning_rate": 2e-05, "loss": 0.7296, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2178, "tokens_per_second_per_gpu": 16929.56, "total_tokens": 214971724 }, { "epoch": 0.1362215553888472, "grad_norm": 0.9143273234367371, "learning_rate": 2e-05, "loss": 0.7897, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2179, "tokens_per_second_per_gpu": 18258.55, "total_tokens": 215075244 }, { "epoch": 0.13628407101775444, "grad_norm": 0.8858209252357483, "learning_rate": 2e-05, "loss": 0.7588, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2180, "tokens_per_second_per_gpu": 18380.77, "total_tokens": 215178447 }, { "epoch": 0.13634658664666166, "grad_norm": 0.962825357913971, "learning_rate": 2e-05, "loss": 0.752, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2181, "tokens_per_second_per_gpu": 17978.4, "total_tokens": 215282427 }, { "epoch": 0.1364091022755689, "grad_norm": 0.9106375575065613, "learning_rate": 2e-05, "loss": 0.7371, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2182, "tokens_per_second_per_gpu": 18549.28, "total_tokens": 215381463 }, { "epoch": 0.13647161790447612, "grad_norm": 0.9353281855583191, "learning_rate": 2e-05, "loss": 0.7564, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2183, "tokens_per_second_per_gpu": 18755.14, "total_tokens": 215483631 }, { "epoch": 0.13653413353338334, "grad_norm": 0.9811944365501404, "learning_rate": 2e-05, "loss": 0.7776, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2184, "tokens_per_second_per_gpu": 19303.35, "total_tokens": 215588191 }, { "epoch": 0.13659664916229058, "grad_norm": 0.929486870765686, "learning_rate": 2e-05, "loss": 0.7616, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2185, "tokens_per_second_per_gpu": 17833.98, "total_tokens": 215690856 }, { "epoch": 0.1366591647911978, "grad_norm": 0.974057674407959, "learning_rate": 2e-05, "loss": 0.7362, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2186, "tokens_per_second_per_gpu": 17547.75, "total_tokens": 215790476 }, { "epoch": 0.13672168042010502, "grad_norm": 0.9022216200828552, "learning_rate": 2e-05, "loss": 0.7492, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2187, "tokens_per_second_per_gpu": 17178.44, "total_tokens": 215887911 }, { "epoch": 0.13678419604901226, "grad_norm": 0.9587225317955017, "learning_rate": 2e-05, "loss": 0.7507, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2188, "tokens_per_second_per_gpu": 17223.79, "total_tokens": 215986362 }, { "epoch": 0.13684671167791948, "grad_norm": 0.9837234020233154, "learning_rate": 2e-05, "loss": 0.7391, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2189, "tokens_per_second_per_gpu": 17040.51, "total_tokens": 216085226 }, { "epoch": 0.1369092273068267, "grad_norm": 0.9423041939735413, "learning_rate": 2e-05, "loss": 0.7419, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2190, "tokens_per_second_per_gpu": 15942.48, "total_tokens": 216179067 }, { "epoch": 0.13697174293573394, "grad_norm": 0.9758543968200684, "learning_rate": 2e-05, "loss": 0.7654, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2191, "tokens_per_second_per_gpu": 17908.06, "total_tokens": 216276511 }, { "epoch": 0.13703425856464116, "grad_norm": 0.9361265897750854, "learning_rate": 2e-05, "loss": 0.755, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2192, "tokens_per_second_per_gpu": 17295.32, "total_tokens": 216377744 }, { "epoch": 0.13709677419354838, "grad_norm": 0.9710594415664673, "learning_rate": 2e-05, "loss": 0.7324, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2193, "tokens_per_second_per_gpu": 15233.51, "total_tokens": 216469924 }, { "epoch": 0.13715928982245562, "grad_norm": 1.0002888441085815, "learning_rate": 2e-05, "loss": 0.7874, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2194, "tokens_per_second_per_gpu": 17001.31, "total_tokens": 216567985 }, { "epoch": 0.13722180545136284, "grad_norm": 0.9573385119438171, "learning_rate": 2e-05, "loss": 0.7295, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2195, "tokens_per_second_per_gpu": 15746.07, "total_tokens": 216659055 }, { "epoch": 0.13728432108027006, "grad_norm": 0.9658790826797485, "learning_rate": 2e-05, "loss": 0.7403, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2196, "tokens_per_second_per_gpu": 17222.87, "total_tokens": 216758281 }, { "epoch": 0.1373468367091773, "grad_norm": 0.9105775952339172, "learning_rate": 2e-05, "loss": 0.734, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2197, "tokens_per_second_per_gpu": 17451.26, "total_tokens": 216856658 }, { "epoch": 0.13740935233808452, "grad_norm": 0.8948565125465393, "learning_rate": 2e-05, "loss": 0.7243, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2198, "tokens_per_second_per_gpu": 16764.16, "total_tokens": 216953428 }, { "epoch": 0.13747186796699173, "grad_norm": 0.9556677341461182, "learning_rate": 2e-05, "loss": 0.765, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2199, "tokens_per_second_per_gpu": 18193.07, "total_tokens": 217053840 }, { "epoch": 0.13753438359589898, "grad_norm": 0.9231751561164856, "learning_rate": 2e-05, "loss": 0.6919, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2200, "tokens_per_second_per_gpu": 17003.8, "total_tokens": 217147422 }, { "epoch": 0.1375968992248062, "grad_norm": 0.8864774107933044, "learning_rate": 2e-05, "loss": 0.7078, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2201, "tokens_per_second_per_gpu": 17241.64, "total_tokens": 217243615 }, { "epoch": 0.13765941485371344, "grad_norm": 0.9269843101501465, "learning_rate": 2e-05, "loss": 0.7821, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2202, "tokens_per_second_per_gpu": 17046.97, "total_tokens": 217343426 }, { "epoch": 0.13772193048262066, "grad_norm": 0.935613751411438, "learning_rate": 2e-05, "loss": 0.7354, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2203, "tokens_per_second_per_gpu": 15975.53, "total_tokens": 217438032 }, { "epoch": 0.13778444611152788, "grad_norm": 1.0230010747909546, "learning_rate": 2e-05, "loss": 0.7964, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2204, "tokens_per_second_per_gpu": 17513.28, "total_tokens": 217535188 }, { "epoch": 0.13784696174043512, "grad_norm": 0.9320389032363892, "learning_rate": 2e-05, "loss": 0.7823, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2205, "tokens_per_second_per_gpu": 17877.38, "total_tokens": 217637339 }, { "epoch": 0.13790947736934234, "grad_norm": 0.9304297566413879, "learning_rate": 2e-05, "loss": 0.7451, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2206, "tokens_per_second_per_gpu": 16057.57, "total_tokens": 217733051 }, { "epoch": 0.13797199299824955, "grad_norm": 0.9086803793907166, "learning_rate": 2e-05, "loss": 0.767, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2207, "tokens_per_second_per_gpu": 18351.89, "total_tokens": 217837891 }, { "epoch": 0.1380345086271568, "grad_norm": 0.9204855561256409, "learning_rate": 2e-05, "loss": 0.6836, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2208, "tokens_per_second_per_gpu": 17629.0, "total_tokens": 217933768 }, { "epoch": 0.13809702425606402, "grad_norm": 0.960174024105072, "learning_rate": 2e-05, "loss": 0.7478, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2209, "tokens_per_second_per_gpu": 16067.94, "total_tokens": 218029612 }, { "epoch": 0.13815953988497123, "grad_norm": 0.9173858165740967, "learning_rate": 2e-05, "loss": 0.7227, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2210, "tokens_per_second_per_gpu": 17571.65, "total_tokens": 218130416 }, { "epoch": 0.13822205551387848, "grad_norm": 0.9430831074714661, "learning_rate": 2e-05, "loss": 0.749, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2211, "tokens_per_second_per_gpu": 17866.17, "total_tokens": 218228886 }, { "epoch": 0.1382845711427857, "grad_norm": 0.9828084111213684, "learning_rate": 2e-05, "loss": 0.7561, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2212, "tokens_per_second_per_gpu": 17319.48, "total_tokens": 218328599 }, { "epoch": 0.1383470867716929, "grad_norm": 0.956210732460022, "learning_rate": 2e-05, "loss": 0.7535, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2213, "tokens_per_second_per_gpu": 16469.38, "total_tokens": 218424510 }, { "epoch": 0.13840960240060016, "grad_norm": 0.9713037014007568, "learning_rate": 2e-05, "loss": 0.7828, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2214, "tokens_per_second_per_gpu": 16355.48, "total_tokens": 218517704 }, { "epoch": 0.13847211802950737, "grad_norm": 0.8975721001625061, "learning_rate": 2e-05, "loss": 0.746, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2215, "tokens_per_second_per_gpu": 18082.25, "total_tokens": 218619687 }, { "epoch": 0.1385346336584146, "grad_norm": 1.0353502035140991, "learning_rate": 2e-05, "loss": 0.7884, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2216, "tokens_per_second_per_gpu": 18108.46, "total_tokens": 218719497 }, { "epoch": 0.13859714928732184, "grad_norm": 0.9221534132957458, "learning_rate": 2e-05, "loss": 0.7157, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2217, "tokens_per_second_per_gpu": 17474.54, "total_tokens": 218814361 }, { "epoch": 0.13865966491622905, "grad_norm": 0.9505317807197571, "learning_rate": 2e-05, "loss": 0.759, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2218, "tokens_per_second_per_gpu": 17283.26, "total_tokens": 218912552 }, { "epoch": 0.1387221805451363, "grad_norm": 0.9520562887191772, "learning_rate": 2e-05, "loss": 0.7333, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2219, "tokens_per_second_per_gpu": 18776.04, "total_tokens": 219013101 }, { "epoch": 0.13878469617404351, "grad_norm": 1.0075483322143555, "learning_rate": 2e-05, "loss": 0.7644, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2220, "tokens_per_second_per_gpu": 17776.91, "total_tokens": 219114337 }, { "epoch": 0.13884721180295073, "grad_norm": 0.9362083673477173, "learning_rate": 2e-05, "loss": 0.7504, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2221, "tokens_per_second_per_gpu": 17799.46, "total_tokens": 219216859 }, { "epoch": 0.13890972743185798, "grad_norm": 0.9315237998962402, "learning_rate": 2e-05, "loss": 0.7509, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2222, "tokens_per_second_per_gpu": 17898.76, "total_tokens": 219317876 }, { "epoch": 0.1389722430607652, "grad_norm": 0.9290388822555542, "learning_rate": 2e-05, "loss": 0.7822, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2223, "tokens_per_second_per_gpu": 18347.37, "total_tokens": 219420169 }, { "epoch": 0.1390347586896724, "grad_norm": 0.981429398059845, "learning_rate": 2e-05, "loss": 0.7943, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2224, "tokens_per_second_per_gpu": 18977.89, "total_tokens": 219526182 }, { "epoch": 0.13909727431857966, "grad_norm": 0.9122908711433411, "learning_rate": 2e-05, "loss": 0.7355, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2225, "tokens_per_second_per_gpu": 17013.95, "total_tokens": 219625763 }, { "epoch": 0.13915978994748687, "grad_norm": 0.9702645540237427, "learning_rate": 2e-05, "loss": 0.7531, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2226, "tokens_per_second_per_gpu": 16996.08, "total_tokens": 219721572 }, { "epoch": 0.1392223055763941, "grad_norm": 0.9494742155075073, "learning_rate": 2e-05, "loss": 0.76, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2227, "tokens_per_second_per_gpu": 18129.51, "total_tokens": 219819930 }, { "epoch": 0.13928482120530133, "grad_norm": 0.9484351873397827, "learning_rate": 2e-05, "loss": 0.7553, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2228, "tokens_per_second_per_gpu": 17368.57, "total_tokens": 219917400 }, { "epoch": 0.13934733683420855, "grad_norm": 0.9039738178253174, "learning_rate": 2e-05, "loss": 0.7446, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2229, "tokens_per_second_per_gpu": 18430.11, "total_tokens": 220018091 }, { "epoch": 0.13940985246311577, "grad_norm": 0.9349783658981323, "learning_rate": 2e-05, "loss": 0.7376, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2230, "tokens_per_second_per_gpu": 17077.37, "total_tokens": 220117448 }, { "epoch": 0.139472368092023, "grad_norm": 1.0071855783462524, "learning_rate": 2e-05, "loss": 0.7931, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2231, "tokens_per_second_per_gpu": 18122.84, "total_tokens": 220219804 }, { "epoch": 0.13953488372093023, "grad_norm": 0.9348598718643188, "learning_rate": 2e-05, "loss": 0.7657, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2232, "tokens_per_second_per_gpu": 17725.77, "total_tokens": 220319530 }, { "epoch": 0.13959739934983745, "grad_norm": 0.8784681558609009, "learning_rate": 2e-05, "loss": 0.7622, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2233, "tokens_per_second_per_gpu": 18627.47, "total_tokens": 220423391 }, { "epoch": 0.1396599149787447, "grad_norm": 0.9273402094841003, "learning_rate": 2e-05, "loss": 0.7831, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2234, "tokens_per_second_per_gpu": 16280.69, "total_tokens": 220521672 }, { "epoch": 0.1397224306076519, "grad_norm": 0.9804201722145081, "learning_rate": 2e-05, "loss": 0.7189, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2235, "tokens_per_second_per_gpu": 18126.93, "total_tokens": 220620406 }, { "epoch": 0.13978494623655913, "grad_norm": 0.9342936873435974, "learning_rate": 2e-05, "loss": 0.7046, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2236, "tokens_per_second_per_gpu": 14939.73, "total_tokens": 220713931 }, { "epoch": 0.13984746186546637, "grad_norm": 0.9405001401901245, "learning_rate": 2e-05, "loss": 0.7568, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2237, "tokens_per_second_per_gpu": 16871.55, "total_tokens": 220808774 }, { "epoch": 0.1399099774943736, "grad_norm": 0.8846423625946045, "learning_rate": 2e-05, "loss": 0.7687, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2238, "tokens_per_second_per_gpu": 17172.36, "total_tokens": 220910777 }, { "epoch": 0.13997249312328083, "grad_norm": 0.9807177782058716, "learning_rate": 2e-05, "loss": 0.7668, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2239, "tokens_per_second_per_gpu": 16944.21, "total_tokens": 221007603 }, { "epoch": 0.14003500875218805, "grad_norm": 0.9725717306137085, "learning_rate": 2e-05, "loss": 0.7545, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2240, "tokens_per_second_per_gpu": 17378.73, "total_tokens": 221105868 }, { "epoch": 0.14009752438109527, "grad_norm": 0.931201159954071, "learning_rate": 2e-05, "loss": 0.7574, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2241, "tokens_per_second_per_gpu": 17331.54, "total_tokens": 221204187 }, { "epoch": 0.1401600400100025, "grad_norm": 0.9891663193702698, "learning_rate": 2e-05, "loss": 0.749, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2242, "tokens_per_second_per_gpu": 16740.47, "total_tokens": 221301607 }, { "epoch": 0.14022255563890973, "grad_norm": 0.9702333807945251, "learning_rate": 2e-05, "loss": 0.7073, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2243, "tokens_per_second_per_gpu": 16006.92, "total_tokens": 221392513 }, { "epoch": 0.14028507126781695, "grad_norm": 0.9522995948791504, "learning_rate": 2e-05, "loss": 0.7104, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2244, "tokens_per_second_per_gpu": 16840.41, "total_tokens": 221486596 }, { "epoch": 0.1403475868967242, "grad_norm": 0.9881544709205627, "learning_rate": 2e-05, "loss": 0.7768, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2245, "tokens_per_second_per_gpu": 15793.12, "total_tokens": 221581589 }, { "epoch": 0.1404101025256314, "grad_norm": 0.9575434923171997, "learning_rate": 2e-05, "loss": 0.778, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2246, "tokens_per_second_per_gpu": 15802.02, "total_tokens": 221679827 }, { "epoch": 0.14047261815453863, "grad_norm": 0.9213532209396362, "learning_rate": 2e-05, "loss": 0.7478, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2247, "tokens_per_second_per_gpu": 17362.62, "total_tokens": 221778243 }, { "epoch": 0.14053513378344587, "grad_norm": 0.9128805994987488, "learning_rate": 2e-05, "loss": 0.752, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2248, "tokens_per_second_per_gpu": 17832.43, "total_tokens": 221880798 }, { "epoch": 0.1405976494123531, "grad_norm": 0.893683135509491, "learning_rate": 2e-05, "loss": 0.6941, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2249, "tokens_per_second_per_gpu": 18928.22, "total_tokens": 221980263 }, { "epoch": 0.1406601650412603, "grad_norm": 0.9792488217353821, "learning_rate": 2e-05, "loss": 0.7453, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2250, "tokens_per_second_per_gpu": 18750.45, "total_tokens": 222081098 }, { "epoch": 0.14072268067016755, "grad_norm": 0.9632937908172607, "learning_rate": 2e-05, "loss": 0.6894, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2251, "tokens_per_second_per_gpu": 16450.04, "total_tokens": 222174424 }, { "epoch": 0.14078519629907477, "grad_norm": 0.9876323938369751, "learning_rate": 2e-05, "loss": 0.7941, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2252, "tokens_per_second_per_gpu": 17653.39, "total_tokens": 222274153 }, { "epoch": 0.14084771192798198, "grad_norm": 0.9536519646644592, "learning_rate": 2e-05, "loss": 0.7516, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2253, "tokens_per_second_per_gpu": 17753.36, "total_tokens": 222373194 }, { "epoch": 0.14091022755688923, "grad_norm": 0.931509792804718, "learning_rate": 2e-05, "loss": 0.7451, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2254, "tokens_per_second_per_gpu": 16230.73, "total_tokens": 222467962 }, { "epoch": 0.14097274318579645, "grad_norm": 0.9573774337768555, "learning_rate": 2e-05, "loss": 0.723, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2255, "tokens_per_second_per_gpu": 15576.2, "total_tokens": 222559156 }, { "epoch": 0.14103525881470366, "grad_norm": 0.9427846074104309, "learning_rate": 2e-05, "loss": 0.7446, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2256, "tokens_per_second_per_gpu": 17574.37, "total_tokens": 222659002 }, { "epoch": 0.1410977744436109, "grad_norm": 0.9253461956977844, "learning_rate": 2e-05, "loss": 0.7531, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2257, "tokens_per_second_per_gpu": 17986.3, "total_tokens": 222760206 }, { "epoch": 0.14116029007251812, "grad_norm": 0.9191311001777649, "learning_rate": 2e-05, "loss": 0.7082, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2258, "tokens_per_second_per_gpu": 17209.92, "total_tokens": 222860837 }, { "epoch": 0.14122280570142537, "grad_norm": 0.9401894211769104, "learning_rate": 2e-05, "loss": 0.7562, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2259, "tokens_per_second_per_gpu": 17715.24, "total_tokens": 222958599 }, { "epoch": 0.14128532133033259, "grad_norm": 0.9359370470046997, "learning_rate": 2e-05, "loss": 0.7772, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2260, "tokens_per_second_per_gpu": 16916.48, "total_tokens": 223059127 }, { "epoch": 0.1413478369592398, "grad_norm": 0.9376384019851685, "learning_rate": 2e-05, "loss": 0.7222, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2261, "tokens_per_second_per_gpu": 17154.96, "total_tokens": 223157296 }, { "epoch": 0.14141035258814705, "grad_norm": 0.9375320672988892, "learning_rate": 2e-05, "loss": 0.7705, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2262, "tokens_per_second_per_gpu": 17507.89, "total_tokens": 223256544 }, { "epoch": 0.14147286821705427, "grad_norm": 0.970886766910553, "learning_rate": 2e-05, "loss": 0.721, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2263, "tokens_per_second_per_gpu": 17162.44, "total_tokens": 223353676 }, { "epoch": 0.14153538384596148, "grad_norm": 0.9078238010406494, "learning_rate": 2e-05, "loss": 0.7597, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2264, "tokens_per_second_per_gpu": 18598.38, "total_tokens": 223460500 }, { "epoch": 0.14159789947486873, "grad_norm": 0.9226902723312378, "learning_rate": 2e-05, "loss": 0.7544, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2265, "tokens_per_second_per_gpu": 17204.05, "total_tokens": 223560002 }, { "epoch": 0.14166041510377594, "grad_norm": 0.9630988836288452, "learning_rate": 2e-05, "loss": 0.7684, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2266, "tokens_per_second_per_gpu": 17388.99, "total_tokens": 223661068 }, { "epoch": 0.14172293073268316, "grad_norm": 0.9508355259895325, "learning_rate": 2e-05, "loss": 0.7793, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2267, "tokens_per_second_per_gpu": 15890.68, "total_tokens": 223756503 }, { "epoch": 0.1417854463615904, "grad_norm": 0.9494704008102417, "learning_rate": 2e-05, "loss": 0.7863, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2268, "tokens_per_second_per_gpu": 17642.25, "total_tokens": 223855996 }, { "epoch": 0.14184796199049762, "grad_norm": 0.961527407169342, "learning_rate": 2e-05, "loss": 0.7368, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2269, "tokens_per_second_per_gpu": 16967.8, "total_tokens": 223950598 }, { "epoch": 0.14191047761940484, "grad_norm": 0.9136742353439331, "learning_rate": 2e-05, "loss": 0.7632, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2270, "tokens_per_second_per_gpu": 18945.32, "total_tokens": 224055839 }, { "epoch": 0.14197299324831208, "grad_norm": 0.9489920735359192, "learning_rate": 2e-05, "loss": 0.7255, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2271, "tokens_per_second_per_gpu": 17537.94, "total_tokens": 224150751 }, { "epoch": 0.1420355088772193, "grad_norm": 0.9448727369308472, "learning_rate": 2e-05, "loss": 0.7589, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2272, "tokens_per_second_per_gpu": 16779.42, "total_tokens": 224250685 }, { "epoch": 0.14209802450612652, "grad_norm": 0.9718078374862671, "learning_rate": 2e-05, "loss": 0.7473, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2273, "tokens_per_second_per_gpu": 17015.43, "total_tokens": 224345625 }, { "epoch": 0.14216054013503376, "grad_norm": 0.8857939839363098, "learning_rate": 2e-05, "loss": 0.7262, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2274, "tokens_per_second_per_gpu": 17227.22, "total_tokens": 224446102 }, { "epoch": 0.14222305576394098, "grad_norm": 0.8835100531578064, "learning_rate": 2e-05, "loss": 0.7339, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2275, "tokens_per_second_per_gpu": 17253.17, "total_tokens": 224544241 }, { "epoch": 0.14228557139284823, "grad_norm": 0.9450851082801819, "learning_rate": 2e-05, "loss": 0.7732, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2276, "tokens_per_second_per_gpu": 18125.11, "total_tokens": 224645042 }, { "epoch": 0.14234808702175544, "grad_norm": 0.8782132863998413, "learning_rate": 2e-05, "loss": 0.7217, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2277, "tokens_per_second_per_gpu": 17208.07, "total_tokens": 224742610 }, { "epoch": 0.14241060265066266, "grad_norm": 0.9135915040969849, "learning_rate": 2e-05, "loss": 0.7271, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2278, "tokens_per_second_per_gpu": 16215.44, "total_tokens": 224840763 }, { "epoch": 0.1424731182795699, "grad_norm": 0.921917200088501, "learning_rate": 2e-05, "loss": 0.7055, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2279, "tokens_per_second_per_gpu": 16216.51, "total_tokens": 224936619 }, { "epoch": 0.14253563390847712, "grad_norm": 0.9394552707672119, "learning_rate": 2e-05, "loss": 0.7486, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2280, "tokens_per_second_per_gpu": 17522.08, "total_tokens": 225033232 }, { "epoch": 0.14259814953738434, "grad_norm": 0.9048981070518494, "learning_rate": 2e-05, "loss": 0.7253, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2281, "tokens_per_second_per_gpu": 17832.74, "total_tokens": 225132747 }, { "epoch": 0.14266066516629158, "grad_norm": 0.9465477466583252, "learning_rate": 2e-05, "loss": 0.7577, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2282, "tokens_per_second_per_gpu": 17201.02, "total_tokens": 225232117 }, { "epoch": 0.1427231807951988, "grad_norm": 0.9610021114349365, "learning_rate": 2e-05, "loss": 0.7894, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2283, "tokens_per_second_per_gpu": 17620.73, "total_tokens": 225333237 }, { "epoch": 0.14278569642410602, "grad_norm": 0.906576931476593, "learning_rate": 2e-05, "loss": 0.7532, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2284, "tokens_per_second_per_gpu": 18556.79, "total_tokens": 225434943 }, { "epoch": 0.14284821205301326, "grad_norm": 0.9110774397850037, "learning_rate": 2e-05, "loss": 0.7485, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2285, "tokens_per_second_per_gpu": 15604.27, "total_tokens": 225532905 }, { "epoch": 0.14291072768192048, "grad_norm": 0.9427352547645569, "learning_rate": 2e-05, "loss": 0.7602, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2286, "tokens_per_second_per_gpu": 17977.36, "total_tokens": 225632371 }, { "epoch": 0.1429732433108277, "grad_norm": 0.9181944727897644, "learning_rate": 2e-05, "loss": 0.7495, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2287, "tokens_per_second_per_gpu": 18063.61, "total_tokens": 225732082 }, { "epoch": 0.14303575893973494, "grad_norm": 0.9570124745368958, "learning_rate": 2e-05, "loss": 0.7298, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2288, "tokens_per_second_per_gpu": 15527.41, "total_tokens": 225826328 }, { "epoch": 0.14309827456864216, "grad_norm": 0.9348074197769165, "learning_rate": 2e-05, "loss": 0.7626, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2289, "tokens_per_second_per_gpu": 17764.19, "total_tokens": 225926536 }, { "epoch": 0.14316079019754938, "grad_norm": 0.9275286197662354, "learning_rate": 2e-05, "loss": 0.7202, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2290, "tokens_per_second_per_gpu": 16829.06, "total_tokens": 226019061 }, { "epoch": 0.14322330582645662, "grad_norm": 0.9540563225746155, "learning_rate": 2e-05, "loss": 0.7542, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2291, "tokens_per_second_per_gpu": 16314.25, "total_tokens": 226114645 }, { "epoch": 0.14328582145536384, "grad_norm": 0.9296776652336121, "learning_rate": 2e-05, "loss": 0.7623, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2292, "tokens_per_second_per_gpu": 17425.09, "total_tokens": 226213036 }, { "epoch": 0.14334833708427105, "grad_norm": 0.9592871069908142, "learning_rate": 2e-05, "loss": 0.7725, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2293, "tokens_per_second_per_gpu": 18034.2, "total_tokens": 226311939 }, { "epoch": 0.1434108527131783, "grad_norm": 0.8944534659385681, "learning_rate": 2e-05, "loss": 0.7242, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2294, "tokens_per_second_per_gpu": 17657.89, "total_tokens": 226411600 }, { "epoch": 0.14347336834208552, "grad_norm": 0.9471592903137207, "learning_rate": 2e-05, "loss": 0.7616, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2295, "tokens_per_second_per_gpu": 18479.94, "total_tokens": 226513855 }, { "epoch": 0.14353588397099276, "grad_norm": 0.9191479086875916, "learning_rate": 2e-05, "loss": 0.7436, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2296, "tokens_per_second_per_gpu": 17245.67, "total_tokens": 226613980 }, { "epoch": 0.14359839959989998, "grad_norm": 0.9878935217857361, "learning_rate": 2e-05, "loss": 0.7688, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2297, "tokens_per_second_per_gpu": 17907.46, "total_tokens": 226715858 }, { "epoch": 0.1436609152288072, "grad_norm": 0.9418429136276245, "learning_rate": 2e-05, "loss": 0.7414, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2298, "tokens_per_second_per_gpu": 17805.46, "total_tokens": 226816304 }, { "epoch": 0.14372343085771444, "grad_norm": 0.9154937267303467, "learning_rate": 2e-05, "loss": 0.6998, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2299, "tokens_per_second_per_gpu": 16825.74, "total_tokens": 226913161 }, { "epoch": 0.14378594648662166, "grad_norm": 0.9767565131187439, "learning_rate": 2e-05, "loss": 0.7062, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2300, "tokens_per_second_per_gpu": 17373.83, "total_tokens": 227014681 }, { "epoch": 0.14384846211552887, "grad_norm": 0.9851359724998474, "learning_rate": 2e-05, "loss": 0.7555, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2301, "tokens_per_second_per_gpu": 17688.72, "total_tokens": 227112557 }, { "epoch": 0.14391097774443612, "grad_norm": 0.9521946907043457, "learning_rate": 2e-05, "loss": 0.7191, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2302, "tokens_per_second_per_gpu": 17586.43, "total_tokens": 227206822 }, { "epoch": 0.14397349337334334, "grad_norm": 1.055992603302002, "learning_rate": 2e-05, "loss": 0.7211, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2303, "tokens_per_second_per_gpu": 15871.22, "total_tokens": 227303267 }, { "epoch": 0.14403600900225055, "grad_norm": 0.997244119644165, "learning_rate": 2e-05, "loss": 0.6982, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2304, "tokens_per_second_per_gpu": 16733.74, "total_tokens": 227400043 }, { "epoch": 0.1440985246311578, "grad_norm": 0.9086995124816895, "learning_rate": 2e-05, "loss": 0.751, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2305, "tokens_per_second_per_gpu": 17858.66, "total_tokens": 227499879 }, { "epoch": 0.14416104026006502, "grad_norm": 0.9950711131095886, "learning_rate": 2e-05, "loss": 0.7792, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2306, "tokens_per_second_per_gpu": 18527.24, "total_tokens": 227598425 }, { "epoch": 0.14422355588897223, "grad_norm": 1.0294321775436401, "learning_rate": 2e-05, "loss": 0.7918, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2307, "tokens_per_second_per_gpu": 16868.77, "total_tokens": 227697580 }, { "epoch": 0.14428607151787948, "grad_norm": 0.9594703912734985, "learning_rate": 2e-05, "loss": 0.7666, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2308, "tokens_per_second_per_gpu": 16438.54, "total_tokens": 227795693 }, { "epoch": 0.1443485871467867, "grad_norm": 0.9465631246566772, "learning_rate": 2e-05, "loss": 0.7285, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2309, "tokens_per_second_per_gpu": 17387.77, "total_tokens": 227895944 }, { "epoch": 0.1444111027756939, "grad_norm": 0.967578113079071, "learning_rate": 2e-05, "loss": 0.7024, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2310, "tokens_per_second_per_gpu": 17573.37, "total_tokens": 227995370 }, { "epoch": 0.14447361840460116, "grad_norm": 1.0847790241241455, "learning_rate": 2e-05, "loss": 0.7192, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2311, "tokens_per_second_per_gpu": 15780.88, "total_tokens": 228089134 }, { "epoch": 0.14453613403350837, "grad_norm": 0.9728552103042603, "learning_rate": 2e-05, "loss": 0.792, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2312, "tokens_per_second_per_gpu": 17498.89, "total_tokens": 228190866 }, { "epoch": 0.1445986496624156, "grad_norm": 1.0236238241195679, "learning_rate": 2e-05, "loss": 0.7452, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2313, "tokens_per_second_per_gpu": 16897.97, "total_tokens": 228293991 }, { "epoch": 0.14466116529132284, "grad_norm": 1.016923427581787, "learning_rate": 2e-05, "loss": 0.7638, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2314, "tokens_per_second_per_gpu": 18826.81, "total_tokens": 228392672 }, { "epoch": 0.14472368092023005, "grad_norm": 0.9913840293884277, "learning_rate": 2e-05, "loss": 0.7538, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2315, "tokens_per_second_per_gpu": 18557.82, "total_tokens": 228493195 }, { "epoch": 0.1447861965491373, "grad_norm": 0.9555473327636719, "learning_rate": 2e-05, "loss": 0.7484, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2316, "tokens_per_second_per_gpu": 16329.25, "total_tokens": 228589078 }, { "epoch": 0.14484871217804451, "grad_norm": 0.9505776762962341, "learning_rate": 2e-05, "loss": 0.7755, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2317, "tokens_per_second_per_gpu": 18334.95, "total_tokens": 228690262 }, { "epoch": 0.14491122780695173, "grad_norm": 0.9709694385528564, "learning_rate": 2e-05, "loss": 0.7744, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2318, "tokens_per_second_per_gpu": 16933.02, "total_tokens": 228790574 }, { "epoch": 0.14497374343585898, "grad_norm": 0.9652517437934875, "learning_rate": 2e-05, "loss": 0.8079, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2319, "tokens_per_second_per_gpu": 18220.71, "total_tokens": 228892548 }, { "epoch": 0.1450362590647662, "grad_norm": 0.909625768661499, "learning_rate": 2e-05, "loss": 0.7837, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2320, "tokens_per_second_per_gpu": 18603.38, "total_tokens": 228994792 }, { "epoch": 0.1450987746936734, "grad_norm": 0.9426104426383972, "learning_rate": 2e-05, "loss": 0.7544, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2321, "tokens_per_second_per_gpu": 17563.11, "total_tokens": 229096124 }, { "epoch": 0.14516129032258066, "grad_norm": 1.0598948001861572, "learning_rate": 2e-05, "loss": 0.7412, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2322, "tokens_per_second_per_gpu": 16403.74, "total_tokens": 229192824 }, { "epoch": 0.14522380595148787, "grad_norm": 0.942672848701477, "learning_rate": 2e-05, "loss": 0.7423, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2323, "tokens_per_second_per_gpu": 16962.98, "total_tokens": 229290894 }, { "epoch": 0.1452863215803951, "grad_norm": 0.9285194277763367, "learning_rate": 2e-05, "loss": 0.7449, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2324, "tokens_per_second_per_gpu": 16215.49, "total_tokens": 229387911 }, { "epoch": 0.14534883720930233, "grad_norm": 0.9150439500808716, "learning_rate": 2e-05, "loss": 0.734, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2325, "tokens_per_second_per_gpu": 18309.19, "total_tokens": 229487355 }, { "epoch": 0.14541135283820955, "grad_norm": 0.9210245609283447, "learning_rate": 2e-05, "loss": 0.7473, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2326, "tokens_per_second_per_gpu": 18957.79, "total_tokens": 229591316 }, { "epoch": 0.14547386846711677, "grad_norm": 0.9296494126319885, "learning_rate": 2e-05, "loss": 0.7284, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2327, "tokens_per_second_per_gpu": 18273.63, "total_tokens": 229692522 }, { "epoch": 0.145536384096024, "grad_norm": 0.90159010887146, "learning_rate": 2e-05, "loss": 0.8116, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2328, "tokens_per_second_per_gpu": 18696.09, "total_tokens": 229795845 }, { "epoch": 0.14559889972493123, "grad_norm": 0.9390770792961121, "learning_rate": 2e-05, "loss": 0.7445, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2329, "tokens_per_second_per_gpu": 17091.17, "total_tokens": 229893091 }, { "epoch": 0.14566141535383845, "grad_norm": 0.9408666491508484, "learning_rate": 2e-05, "loss": 0.7797, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2330, "tokens_per_second_per_gpu": 17663.7, "total_tokens": 229993532 }, { "epoch": 0.1457239309827457, "grad_norm": 0.9887981414794922, "learning_rate": 2e-05, "loss": 0.7809, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2331, "tokens_per_second_per_gpu": 16866.99, "total_tokens": 230093205 }, { "epoch": 0.1457864466116529, "grad_norm": 0.9531126618385315, "learning_rate": 2e-05, "loss": 0.7592, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2332, "tokens_per_second_per_gpu": 17978.94, "total_tokens": 230193154 }, { "epoch": 0.14584896224056015, "grad_norm": 0.9045240879058838, "learning_rate": 2e-05, "loss": 0.7486, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2333, "tokens_per_second_per_gpu": 16910.69, "total_tokens": 230291840 }, { "epoch": 0.14591147786946737, "grad_norm": 0.9358149766921997, "learning_rate": 2e-05, "loss": 0.7005, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2334, "tokens_per_second_per_gpu": 16549.05, "total_tokens": 230388826 }, { "epoch": 0.1459739934983746, "grad_norm": 0.9639768600463867, "learning_rate": 2e-05, "loss": 0.7562, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2335, "tokens_per_second_per_gpu": 16869.77, "total_tokens": 230485897 }, { "epoch": 0.14603650912728183, "grad_norm": 1.0193935632705688, "learning_rate": 2e-05, "loss": 0.6954, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2336, "tokens_per_second_per_gpu": 15252.69, "total_tokens": 230576468 }, { "epoch": 0.14609902475618905, "grad_norm": 0.9142275452613831, "learning_rate": 2e-05, "loss": 0.7285, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2337, "tokens_per_second_per_gpu": 16679.7, "total_tokens": 230672948 }, { "epoch": 0.14616154038509627, "grad_norm": 0.9710169434547424, "learning_rate": 2e-05, "loss": 0.7406, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2338, "tokens_per_second_per_gpu": 17077.83, "total_tokens": 230773293 }, { "epoch": 0.1462240560140035, "grad_norm": 0.9967721104621887, "learning_rate": 2e-05, "loss": 0.7608, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2339, "tokens_per_second_per_gpu": 17705.26, "total_tokens": 230877390 }, { "epoch": 0.14628657164291073, "grad_norm": 0.9564122557640076, "learning_rate": 2e-05, "loss": 0.7139, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2340, "tokens_per_second_per_gpu": 17925.87, "total_tokens": 230976100 }, { "epoch": 0.14634908727181795, "grad_norm": 0.9292271733283997, "learning_rate": 2e-05, "loss": 0.7902, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2341, "tokens_per_second_per_gpu": 17303.23, "total_tokens": 231076354 }, { "epoch": 0.1464116029007252, "grad_norm": 0.9348763823509216, "learning_rate": 2e-05, "loss": 0.7406, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2342, "tokens_per_second_per_gpu": 17449.21, "total_tokens": 231177741 }, { "epoch": 0.1464741185296324, "grad_norm": 0.8882434964179993, "learning_rate": 2e-05, "loss": 0.7406, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2343, "tokens_per_second_per_gpu": 17794.2, "total_tokens": 231280020 }, { "epoch": 0.14653663415853962, "grad_norm": 0.9552232027053833, "learning_rate": 2e-05, "loss": 0.7415, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2344, "tokens_per_second_per_gpu": 15971.35, "total_tokens": 231374908 }, { "epoch": 0.14659914978744687, "grad_norm": 0.9233739972114563, "learning_rate": 2e-05, "loss": 0.7668, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2345, "tokens_per_second_per_gpu": 17798.42, "total_tokens": 231476140 }, { "epoch": 0.1466616654163541, "grad_norm": 0.9203407764434814, "learning_rate": 2e-05, "loss": 0.7892, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2346, "tokens_per_second_per_gpu": 16846.82, "total_tokens": 231579808 }, { "epoch": 0.1467241810452613, "grad_norm": 1.0251703262329102, "learning_rate": 2e-05, "loss": 0.7422, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2347, "tokens_per_second_per_gpu": 17218.13, "total_tokens": 231678072 }, { "epoch": 0.14678669667416855, "grad_norm": 0.955276370048523, "learning_rate": 2e-05, "loss": 0.7358, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2348, "tokens_per_second_per_gpu": 16323.82, "total_tokens": 231774726 }, { "epoch": 0.14684921230307577, "grad_norm": 0.9001144170761108, "learning_rate": 2e-05, "loss": 0.7705, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2349, "tokens_per_second_per_gpu": 18508.18, "total_tokens": 231879542 }, { "epoch": 0.14691172793198298, "grad_norm": 0.9354445934295654, "learning_rate": 2e-05, "loss": 0.6942, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2350, "tokens_per_second_per_gpu": 18518.74, "total_tokens": 231981463 }, { "epoch": 0.14697424356089023, "grad_norm": 0.952812910079956, "learning_rate": 2e-05, "loss": 0.7966, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2351, "tokens_per_second_per_gpu": 16997.45, "total_tokens": 232080694 }, { "epoch": 0.14703675918979744, "grad_norm": 0.8806135058403015, "learning_rate": 2e-05, "loss": 0.7367, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2352, "tokens_per_second_per_gpu": 17542.74, "total_tokens": 232182224 }, { "epoch": 0.1470992748187047, "grad_norm": 0.9096969962120056, "learning_rate": 2e-05, "loss": 0.7395, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2353, "tokens_per_second_per_gpu": 18094.16, "total_tokens": 232284528 }, { "epoch": 0.1471617904476119, "grad_norm": 0.9525404572486877, "learning_rate": 2e-05, "loss": 0.7433, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2354, "tokens_per_second_per_gpu": 17056.07, "total_tokens": 232378644 }, { "epoch": 0.14722430607651912, "grad_norm": 0.9275920987129211, "learning_rate": 2e-05, "loss": 0.7125, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2355, "tokens_per_second_per_gpu": 17867.53, "total_tokens": 232480419 }, { "epoch": 0.14728682170542637, "grad_norm": 0.9343903064727783, "learning_rate": 2e-05, "loss": 0.7689, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2356, "tokens_per_second_per_gpu": 16910.97, "total_tokens": 232580636 }, { "epoch": 0.14734933733433359, "grad_norm": 0.9350087642669678, "learning_rate": 2e-05, "loss": 0.779, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2357, "tokens_per_second_per_gpu": 17979.73, "total_tokens": 232681295 }, { "epoch": 0.1474118529632408, "grad_norm": 0.8900233507156372, "learning_rate": 2e-05, "loss": 0.7343, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2358, "tokens_per_second_per_gpu": 17198.34, "total_tokens": 232779890 }, { "epoch": 0.14747436859214805, "grad_norm": 0.9346533417701721, "learning_rate": 2e-05, "loss": 0.7876, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2359, "tokens_per_second_per_gpu": 17540.48, "total_tokens": 232882873 }, { "epoch": 0.14753688422105526, "grad_norm": 0.9186961054801941, "learning_rate": 2e-05, "loss": 0.7376, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2360, "tokens_per_second_per_gpu": 17592.73, "total_tokens": 232980593 }, { "epoch": 0.14759939984996248, "grad_norm": 0.9707249999046326, "learning_rate": 2e-05, "loss": 0.7322, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2361, "tokens_per_second_per_gpu": 17963.37, "total_tokens": 233082027 }, { "epoch": 0.14766191547886973, "grad_norm": 0.9165551662445068, "learning_rate": 2e-05, "loss": 0.7485, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2362, "tokens_per_second_per_gpu": 17289.7, "total_tokens": 233180720 }, { "epoch": 0.14772443110777694, "grad_norm": 0.9115060567855835, "learning_rate": 2e-05, "loss": 0.7204, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2363, "tokens_per_second_per_gpu": 15976.89, "total_tokens": 233276226 }, { "epoch": 0.14778694673668416, "grad_norm": 0.9031642079353333, "learning_rate": 2e-05, "loss": 0.7516, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2364, "tokens_per_second_per_gpu": 18019.47, "total_tokens": 233378852 }, { "epoch": 0.1478494623655914, "grad_norm": 0.92325758934021, "learning_rate": 2e-05, "loss": 0.717, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2365, "tokens_per_second_per_gpu": 16554.42, "total_tokens": 233474999 }, { "epoch": 0.14791197799449862, "grad_norm": 0.9226508736610413, "learning_rate": 2e-05, "loss": 0.7172, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2366, "tokens_per_second_per_gpu": 17622.31, "total_tokens": 233572526 }, { "epoch": 0.14797449362340584, "grad_norm": 0.9110630750656128, "learning_rate": 2e-05, "loss": 0.8056, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2367, "tokens_per_second_per_gpu": 17912.97, "total_tokens": 233676054 }, { "epoch": 0.14803700925231308, "grad_norm": 0.9347302317619324, "learning_rate": 2e-05, "loss": 0.7502, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2368, "tokens_per_second_per_gpu": 16474.43, "total_tokens": 233775037 }, { "epoch": 0.1480995248812203, "grad_norm": 0.9413275718688965, "learning_rate": 2e-05, "loss": 0.7228, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2369, "tokens_per_second_per_gpu": 17353.95, "total_tokens": 233872529 }, { "epoch": 0.14816204051012752, "grad_norm": 0.9386781454086304, "learning_rate": 2e-05, "loss": 0.8181, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2370, "tokens_per_second_per_gpu": 18366.81, "total_tokens": 233978438 }, { "epoch": 0.14822455613903476, "grad_norm": 0.9189738035202026, "learning_rate": 2e-05, "loss": 0.7367, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2371, "tokens_per_second_per_gpu": 16835.7, "total_tokens": 234074231 }, { "epoch": 0.14828707176794198, "grad_norm": 0.8832316398620605, "learning_rate": 2e-05, "loss": 0.7433, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2372, "tokens_per_second_per_gpu": 18549.33, "total_tokens": 234180127 }, { "epoch": 0.14834958739684923, "grad_norm": 0.9252215623855591, "learning_rate": 2e-05, "loss": 0.7167, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2373, "tokens_per_second_per_gpu": 16567.2, "total_tokens": 234276529 }, { "epoch": 0.14841210302575644, "grad_norm": 1.0150166749954224, "learning_rate": 2e-05, "loss": 0.7395, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2374, "tokens_per_second_per_gpu": 17869.98, "total_tokens": 234374587 }, { "epoch": 0.14847461865466366, "grad_norm": 0.9380213022232056, "learning_rate": 2e-05, "loss": 0.7499, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2375, "tokens_per_second_per_gpu": 17470.68, "total_tokens": 234473350 }, { "epoch": 0.1485371342835709, "grad_norm": 0.9412555694580078, "learning_rate": 2e-05, "loss": 0.758, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2376, "tokens_per_second_per_gpu": 18035.94, "total_tokens": 234573894 }, { "epoch": 0.14859964991247812, "grad_norm": 0.9281254410743713, "learning_rate": 2e-05, "loss": 0.7382, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2377, "tokens_per_second_per_gpu": 16995.16, "total_tokens": 234672348 }, { "epoch": 0.14866216554138534, "grad_norm": 0.8887490034103394, "learning_rate": 2e-05, "loss": 0.6782, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2378, "tokens_per_second_per_gpu": 17478.2, "total_tokens": 234770460 }, { "epoch": 0.14872468117029258, "grad_norm": 0.9629641771316528, "learning_rate": 2e-05, "loss": 0.7426, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2379, "tokens_per_second_per_gpu": 17209.19, "total_tokens": 234865643 }, { "epoch": 0.1487871967991998, "grad_norm": 0.9139932990074158, "learning_rate": 2e-05, "loss": 0.8032, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2380, "tokens_per_second_per_gpu": 17885.36, "total_tokens": 234968223 }, { "epoch": 0.14884971242810702, "grad_norm": 0.9337977766990662, "learning_rate": 2e-05, "loss": 0.7303, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2381, "tokens_per_second_per_gpu": 17762.04, "total_tokens": 235067339 }, { "epoch": 0.14891222805701426, "grad_norm": 0.9307460784912109, "learning_rate": 2e-05, "loss": 0.7507, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2382, "tokens_per_second_per_gpu": 17220.01, "total_tokens": 235166880 }, { "epoch": 0.14897474368592148, "grad_norm": 0.9181200861930847, "learning_rate": 2e-05, "loss": 0.7475, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2383, "tokens_per_second_per_gpu": 17935.72, "total_tokens": 235267647 }, { "epoch": 0.1490372593148287, "grad_norm": 0.9173935651779175, "learning_rate": 2e-05, "loss": 0.7341, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2384, "tokens_per_second_per_gpu": 16866.21, "total_tokens": 235361583 }, { "epoch": 0.14909977494373594, "grad_norm": 0.9317901730537415, "learning_rate": 2e-05, "loss": 0.7635, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2385, "tokens_per_second_per_gpu": 16682.62, "total_tokens": 235462992 }, { "epoch": 0.14916229057264316, "grad_norm": 0.9271774291992188, "learning_rate": 2e-05, "loss": 0.7344, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2386, "tokens_per_second_per_gpu": 17691.81, "total_tokens": 235562361 }, { "epoch": 0.14922480620155038, "grad_norm": 0.9245854020118713, "learning_rate": 2e-05, "loss": 0.7394, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2387, "tokens_per_second_per_gpu": 18277.36, "total_tokens": 235659713 }, { "epoch": 0.14928732183045762, "grad_norm": 0.8632555603981018, "learning_rate": 2e-05, "loss": 0.7195, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2388, "tokens_per_second_per_gpu": 17883.64, "total_tokens": 235759983 }, { "epoch": 0.14934983745936484, "grad_norm": 0.9051061868667603, "learning_rate": 2e-05, "loss": 0.7047, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2389, "tokens_per_second_per_gpu": 17690.38, "total_tokens": 235861201 }, { "epoch": 0.14941235308827208, "grad_norm": 0.9042858481407166, "learning_rate": 2e-05, "loss": 0.7195, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2390, "tokens_per_second_per_gpu": 18270.98, "total_tokens": 235961337 }, { "epoch": 0.1494748687171793, "grad_norm": 0.9023668169975281, "learning_rate": 2e-05, "loss": 0.711, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2391, "tokens_per_second_per_gpu": 16937.35, "total_tokens": 236059869 }, { "epoch": 0.14953738434608652, "grad_norm": 0.9099642038345337, "learning_rate": 2e-05, "loss": 0.737, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2392, "tokens_per_second_per_gpu": 16831.9, "total_tokens": 236158023 }, { "epoch": 0.14959989997499376, "grad_norm": 0.891064465045929, "learning_rate": 2e-05, "loss": 0.7008, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2393, "tokens_per_second_per_gpu": 15890.33, "total_tokens": 236255113 }, { "epoch": 0.14966241560390098, "grad_norm": 0.9015365242958069, "learning_rate": 2e-05, "loss": 0.7044, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2394, "tokens_per_second_per_gpu": 18224.97, "total_tokens": 236354146 }, { "epoch": 0.1497249312328082, "grad_norm": 0.9525061845779419, "learning_rate": 2e-05, "loss": 0.7625, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2395, "tokens_per_second_per_gpu": 17508.71, "total_tokens": 236454061 }, { "epoch": 0.14978744686171544, "grad_norm": 0.8977583646774292, "learning_rate": 2e-05, "loss": 0.7846, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2396, "tokens_per_second_per_gpu": 17918.51, "total_tokens": 236556295 }, { "epoch": 0.14984996249062266, "grad_norm": 0.9142245054244995, "learning_rate": 2e-05, "loss": 0.7208, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2397, "tokens_per_second_per_gpu": 17187.75, "total_tokens": 236656437 }, { "epoch": 0.14991247811952987, "grad_norm": 0.9082240462303162, "learning_rate": 2e-05, "loss": 0.7391, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2398, "tokens_per_second_per_gpu": 17553.2, "total_tokens": 236758547 }, { "epoch": 0.14997499374843712, "grad_norm": 0.9431834816932678, "learning_rate": 2e-05, "loss": 0.7437, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2399, "tokens_per_second_per_gpu": 17908.3, "total_tokens": 236860677 }, { "epoch": 0.15003750937734434, "grad_norm": 0.9724439382553101, "learning_rate": 2e-05, "loss": 0.744, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2400, "tokens_per_second_per_gpu": 16312.34, "total_tokens": 236956955 }, { "epoch": 0.15010002500625155, "grad_norm": 0.9258655905723572, "learning_rate": 2e-05, "loss": 0.7458, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2401, "tokens_per_second_per_gpu": 17506.09, "total_tokens": 237054512 }, { "epoch": 0.1501625406351588, "grad_norm": 0.9079503417015076, "learning_rate": 2e-05, "loss": 0.7137, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2402, "tokens_per_second_per_gpu": 18366.17, "total_tokens": 237154428 }, { "epoch": 0.15022505626406601, "grad_norm": 0.9106379151344299, "learning_rate": 2e-05, "loss": 0.745, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2403, "tokens_per_second_per_gpu": 17341.74, "total_tokens": 237251053 }, { "epoch": 0.15028757189297323, "grad_norm": 0.9163603782653809, "learning_rate": 2e-05, "loss": 0.7457, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2404, "tokens_per_second_per_gpu": 16634.79, "total_tokens": 237351498 }, { "epoch": 0.15035008752188048, "grad_norm": 0.937305212020874, "learning_rate": 2e-05, "loss": 0.7417, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2405, "tokens_per_second_per_gpu": 15800.87, "total_tokens": 237447417 }, { "epoch": 0.1504126031507877, "grad_norm": 0.9245348572731018, "learning_rate": 2e-05, "loss": 0.7324, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2406, "tokens_per_second_per_gpu": 18945.49, "total_tokens": 237549358 }, { "epoch": 0.1504751187796949, "grad_norm": 0.9213846921920776, "learning_rate": 2e-05, "loss": 0.7586, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2407, "tokens_per_second_per_gpu": 17270.35, "total_tokens": 237648214 }, { "epoch": 0.15053763440860216, "grad_norm": 0.9306416511535645, "learning_rate": 2e-05, "loss": 0.7936, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2408, "tokens_per_second_per_gpu": 17445.46, "total_tokens": 237751305 }, { "epoch": 0.15060015003750937, "grad_norm": 0.9184945225715637, "learning_rate": 2e-05, "loss": 0.7363, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2409, "tokens_per_second_per_gpu": 18961.65, "total_tokens": 237853306 }, { "epoch": 0.15066266566641662, "grad_norm": 0.9654056429862976, "learning_rate": 2e-05, "loss": 0.7279, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2410, "tokens_per_second_per_gpu": 17437.63, "total_tokens": 237952459 }, { "epoch": 0.15072518129532383, "grad_norm": 0.9498223066329956, "learning_rate": 2e-05, "loss": 0.7367, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2411, "tokens_per_second_per_gpu": 17839.38, "total_tokens": 238049950 }, { "epoch": 0.15078769692423105, "grad_norm": 0.9189356565475464, "learning_rate": 2e-05, "loss": 0.7467, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2412, "tokens_per_second_per_gpu": 17402.51, "total_tokens": 238149226 }, { "epoch": 0.1508502125531383, "grad_norm": 0.9758737683296204, "learning_rate": 2e-05, "loss": 0.7399, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2413, "tokens_per_second_per_gpu": 17672.7, "total_tokens": 238248618 }, { "epoch": 0.1509127281820455, "grad_norm": 0.8921339511871338, "learning_rate": 2e-05, "loss": 0.772, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2414, "tokens_per_second_per_gpu": 18490.94, "total_tokens": 238353060 }, { "epoch": 0.15097524381095273, "grad_norm": 0.9086699485778809, "learning_rate": 2e-05, "loss": 0.7394, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2415, "tokens_per_second_per_gpu": 18523.4, "total_tokens": 238456247 }, { "epoch": 0.15103775943985998, "grad_norm": 0.9237386584281921, "learning_rate": 2e-05, "loss": 0.7245, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2416, "tokens_per_second_per_gpu": 16834.38, "total_tokens": 238555432 }, { "epoch": 0.1511002750687672, "grad_norm": 0.9667184352874756, "learning_rate": 2e-05, "loss": 0.7329, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2417, "tokens_per_second_per_gpu": 16528.76, "total_tokens": 238651630 }, { "epoch": 0.1511627906976744, "grad_norm": 0.9227876663208008, "learning_rate": 2e-05, "loss": 0.7787, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2418, "tokens_per_second_per_gpu": 18313.9, "total_tokens": 238756364 }, { "epoch": 0.15122530632658165, "grad_norm": 0.9417151808738708, "learning_rate": 2e-05, "loss": 0.7556, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2419, "tokens_per_second_per_gpu": 17624.29, "total_tokens": 238857096 }, { "epoch": 0.15128782195548887, "grad_norm": 0.9170976281166077, "learning_rate": 2e-05, "loss": 0.6977, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2420, "tokens_per_second_per_gpu": 17720.49, "total_tokens": 238958269 }, { "epoch": 0.1513503375843961, "grad_norm": 0.9164133667945862, "learning_rate": 2e-05, "loss": 0.7508, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2421, "tokens_per_second_per_gpu": 17403.47, "total_tokens": 239056279 }, { "epoch": 0.15141285321330333, "grad_norm": 0.8929149508476257, "learning_rate": 2e-05, "loss": 0.724, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2422, "tokens_per_second_per_gpu": 18297.28, "total_tokens": 239158892 }, { "epoch": 0.15147536884221055, "grad_norm": 0.9488248229026794, "learning_rate": 2e-05, "loss": 0.7114, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2423, "tokens_per_second_per_gpu": 16920.48, "total_tokens": 239256670 }, { "epoch": 0.15153788447111777, "grad_norm": 0.9450362324714661, "learning_rate": 2e-05, "loss": 0.7452, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2424, "tokens_per_second_per_gpu": 17144.32, "total_tokens": 239354555 }, { "epoch": 0.151600400100025, "grad_norm": 0.8872185349464417, "learning_rate": 2e-05, "loss": 0.7438, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2425, "tokens_per_second_per_gpu": 17516.51, "total_tokens": 239456222 }, { "epoch": 0.15166291572893223, "grad_norm": 0.8949518203735352, "learning_rate": 2e-05, "loss": 0.7359, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2426, "tokens_per_second_per_gpu": 16792.82, "total_tokens": 239557196 }, { "epoch": 0.15172543135783945, "grad_norm": 0.9032144546508789, "learning_rate": 2e-05, "loss": 0.7435, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2427, "tokens_per_second_per_gpu": 17474.32, "total_tokens": 239661180 }, { "epoch": 0.1517879469867467, "grad_norm": 0.9262467622756958, "learning_rate": 2e-05, "loss": 0.7483, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2428, "tokens_per_second_per_gpu": 17166.55, "total_tokens": 239759069 }, { "epoch": 0.1518504626156539, "grad_norm": 0.8986597061157227, "learning_rate": 2e-05, "loss": 0.7041, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2429, "tokens_per_second_per_gpu": 17113.21, "total_tokens": 239856952 }, { "epoch": 0.15191297824456115, "grad_norm": 0.910344660282135, "learning_rate": 2e-05, "loss": 0.7383, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2430, "tokens_per_second_per_gpu": 17689.21, "total_tokens": 239957641 }, { "epoch": 0.15197549387346837, "grad_norm": 0.8871532678604126, "learning_rate": 2e-05, "loss": 0.6964, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2431, "tokens_per_second_per_gpu": 16731.61, "total_tokens": 240053658 }, { "epoch": 0.1520380095023756, "grad_norm": 0.9024803638458252, "learning_rate": 2e-05, "loss": 0.7445, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2432, "tokens_per_second_per_gpu": 17509.47, "total_tokens": 240157322 }, { "epoch": 0.15210052513128283, "grad_norm": 0.9218181371688843, "learning_rate": 2e-05, "loss": 0.7516, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2433, "tokens_per_second_per_gpu": 17373.51, "total_tokens": 240253774 }, { "epoch": 0.15216304076019005, "grad_norm": 0.9245128035545349, "learning_rate": 2e-05, "loss": 0.7417, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2434, "tokens_per_second_per_gpu": 17566.26, "total_tokens": 240352002 }, { "epoch": 0.15222555638909727, "grad_norm": 0.9048457145690918, "learning_rate": 2e-05, "loss": 0.6978, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2435, "tokens_per_second_per_gpu": 17198.12, "total_tokens": 240448508 }, { "epoch": 0.1522880720180045, "grad_norm": 0.9183921217918396, "learning_rate": 2e-05, "loss": 0.7414, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2436, "tokens_per_second_per_gpu": 17443.48, "total_tokens": 240549018 }, { "epoch": 0.15235058764691173, "grad_norm": 0.9030733108520508, "learning_rate": 2e-05, "loss": 0.7639, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2437, "tokens_per_second_per_gpu": 17367.43, "total_tokens": 240649828 }, { "epoch": 0.15241310327581895, "grad_norm": 0.8934236764907837, "learning_rate": 2e-05, "loss": 0.7058, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2438, "tokens_per_second_per_gpu": 15933.23, "total_tokens": 240745199 }, { "epoch": 0.1524756189047262, "grad_norm": 0.910855770111084, "learning_rate": 2e-05, "loss": 0.746, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2439, "tokens_per_second_per_gpu": 17906.56, "total_tokens": 240844918 }, { "epoch": 0.1525381345336334, "grad_norm": 0.9281234741210938, "learning_rate": 2e-05, "loss": 0.7204, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2440, "tokens_per_second_per_gpu": 17841.26, "total_tokens": 240944852 }, { "epoch": 0.15260065016254062, "grad_norm": 0.9288761615753174, "learning_rate": 2e-05, "loss": 0.7184, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2441, "tokens_per_second_per_gpu": 18067.07, "total_tokens": 241046679 }, { "epoch": 0.15266316579144787, "grad_norm": 0.9507826566696167, "learning_rate": 2e-05, "loss": 0.7679, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2442, "tokens_per_second_per_gpu": 18154.83, "total_tokens": 241150764 }, { "epoch": 0.15272568142035509, "grad_norm": 0.9230336546897888, "learning_rate": 2e-05, "loss": 0.7374, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2443, "tokens_per_second_per_gpu": 17452.97, "total_tokens": 241247773 }, { "epoch": 0.1527881970492623, "grad_norm": 0.9596031904220581, "learning_rate": 2e-05, "loss": 0.7516, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2444, "tokens_per_second_per_gpu": 18072.35, "total_tokens": 241348455 }, { "epoch": 0.15285071267816955, "grad_norm": 0.9017069935798645, "learning_rate": 2e-05, "loss": 0.7182, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2445, "tokens_per_second_per_gpu": 17381.3, "total_tokens": 241447478 }, { "epoch": 0.15291322830707677, "grad_norm": 0.9257224798202515, "learning_rate": 2e-05, "loss": 0.809, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2446, "tokens_per_second_per_gpu": 17317.57, "total_tokens": 241549285 }, { "epoch": 0.15297574393598398, "grad_norm": 0.9409357905387878, "learning_rate": 2e-05, "loss": 0.7777, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2447, "tokens_per_second_per_gpu": 17416.62, "total_tokens": 241650711 }, { "epoch": 0.15303825956489123, "grad_norm": 0.9163610935211182, "learning_rate": 2e-05, "loss": 0.7162, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2448, "tokens_per_second_per_gpu": 16950.82, "total_tokens": 241750132 }, { "epoch": 0.15310077519379844, "grad_norm": 0.9227428436279297, "learning_rate": 2e-05, "loss": 0.7092, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2449, "tokens_per_second_per_gpu": 17613.16, "total_tokens": 241851179 }, { "epoch": 0.1531632908227057, "grad_norm": 0.9125744700431824, "learning_rate": 2e-05, "loss": 0.7592, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2450, "tokens_per_second_per_gpu": 18864.35, "total_tokens": 241952389 }, { "epoch": 0.1532258064516129, "grad_norm": 0.9285277724266052, "learning_rate": 2e-05, "loss": 0.7714, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2451, "tokens_per_second_per_gpu": 18123.12, "total_tokens": 242052732 }, { "epoch": 0.15328832208052012, "grad_norm": 0.9226130247116089, "learning_rate": 2e-05, "loss": 0.7307, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2452, "tokens_per_second_per_gpu": 17838.47, "total_tokens": 242149345 }, { "epoch": 0.15335083770942737, "grad_norm": 0.932012677192688, "learning_rate": 2e-05, "loss": 0.7193, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2453, "tokens_per_second_per_gpu": 17180.62, "total_tokens": 242243569 }, { "epoch": 0.15341335333833458, "grad_norm": 0.902216374874115, "learning_rate": 2e-05, "loss": 0.7278, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2454, "tokens_per_second_per_gpu": 18552.09, "total_tokens": 242347571 }, { "epoch": 0.1534758689672418, "grad_norm": 0.9569623470306396, "learning_rate": 2e-05, "loss": 0.7339, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2455, "tokens_per_second_per_gpu": 16882.09, "total_tokens": 242447256 }, { "epoch": 0.15353838459614905, "grad_norm": 0.9209882020950317, "learning_rate": 2e-05, "loss": 0.7272, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2456, "tokens_per_second_per_gpu": 17534.93, "total_tokens": 242550040 }, { "epoch": 0.15360090022505626, "grad_norm": 0.934664785861969, "learning_rate": 2e-05, "loss": 0.7682, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2457, "tokens_per_second_per_gpu": 18826.09, "total_tokens": 242648494 }, { "epoch": 0.15366341585396348, "grad_norm": 0.9197582006454468, "learning_rate": 2e-05, "loss": 0.738, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2458, "tokens_per_second_per_gpu": 17057.98, "total_tokens": 242745818 }, { "epoch": 0.15372593148287073, "grad_norm": 0.9485012292861938, "learning_rate": 2e-05, "loss": 0.7543, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2459, "tokens_per_second_per_gpu": 18125.08, "total_tokens": 242847063 }, { "epoch": 0.15378844711177794, "grad_norm": 0.9414323568344116, "learning_rate": 2e-05, "loss": 0.7637, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2460, "tokens_per_second_per_gpu": 16588.36, "total_tokens": 242946733 }, { "epoch": 0.15385096274068516, "grad_norm": 0.9335825443267822, "learning_rate": 2e-05, "loss": 0.7247, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2461, "tokens_per_second_per_gpu": 16476.23, "total_tokens": 243042777 }, { "epoch": 0.1539134783695924, "grad_norm": 0.8947253823280334, "learning_rate": 2e-05, "loss": 0.7318, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2462, "tokens_per_second_per_gpu": 17444.6, "total_tokens": 243141963 }, { "epoch": 0.15397599399849962, "grad_norm": 0.9516562223434448, "learning_rate": 2e-05, "loss": 0.7375, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2463, "tokens_per_second_per_gpu": 18013.32, "total_tokens": 243246145 }, { "epoch": 0.15403850962740684, "grad_norm": 0.9103440642356873, "learning_rate": 2e-05, "loss": 0.7636, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2464, "tokens_per_second_per_gpu": 18826.97, "total_tokens": 243350924 }, { "epoch": 0.15410102525631408, "grad_norm": 0.8973153233528137, "learning_rate": 2e-05, "loss": 0.729, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2465, "tokens_per_second_per_gpu": 16735.05, "total_tokens": 243451509 }, { "epoch": 0.1541635408852213, "grad_norm": 0.9179115295410156, "learning_rate": 2e-05, "loss": 0.7505, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2466, "tokens_per_second_per_gpu": 16625.97, "total_tokens": 243546997 }, { "epoch": 0.15422605651412855, "grad_norm": 0.9120661020278931, "learning_rate": 2e-05, "loss": 0.7117, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2467, "tokens_per_second_per_gpu": 16755.1, "total_tokens": 243642454 }, { "epoch": 0.15428857214303576, "grad_norm": 0.932219922542572, "learning_rate": 2e-05, "loss": 0.7486, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2468, "tokens_per_second_per_gpu": 18945.93, "total_tokens": 243745072 }, { "epoch": 0.15435108777194298, "grad_norm": 0.9303089380264282, "learning_rate": 2e-05, "loss": 0.7722, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2469, "tokens_per_second_per_gpu": 17543.17, "total_tokens": 243846194 }, { "epoch": 0.15441360340085022, "grad_norm": 0.9303684234619141, "learning_rate": 2e-05, "loss": 0.7535, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2470, "tokens_per_second_per_gpu": 16989.68, "total_tokens": 243944547 }, { "epoch": 0.15447611902975744, "grad_norm": 0.9268680810928345, "learning_rate": 2e-05, "loss": 0.7186, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2471, "tokens_per_second_per_gpu": 16818.43, "total_tokens": 244042038 }, { "epoch": 0.15453863465866466, "grad_norm": 0.9315420985221863, "learning_rate": 2e-05, "loss": 0.7252, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2472, "tokens_per_second_per_gpu": 17272.42, "total_tokens": 244138434 }, { "epoch": 0.1546011502875719, "grad_norm": 0.9528121948242188, "learning_rate": 2e-05, "loss": 0.8028, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2473, "tokens_per_second_per_gpu": 18766.72, "total_tokens": 244245010 }, { "epoch": 0.15466366591647912, "grad_norm": 0.9078012108802795, "learning_rate": 2e-05, "loss": 0.7135, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2474, "tokens_per_second_per_gpu": 17445.4, "total_tokens": 244343163 }, { "epoch": 0.15472618154538634, "grad_norm": 0.9293259978294373, "learning_rate": 2e-05, "loss": 0.795, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2475, "tokens_per_second_per_gpu": 17490.77, "total_tokens": 244443246 }, { "epoch": 0.15478869717429358, "grad_norm": 0.9233952760696411, "learning_rate": 2e-05, "loss": 0.8069, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2476, "tokens_per_second_per_gpu": 18505.66, "total_tokens": 244543892 }, { "epoch": 0.1548512128032008, "grad_norm": 0.9061563611030579, "learning_rate": 2e-05, "loss": 0.7421, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2477, "tokens_per_second_per_gpu": 17334.96, "total_tokens": 244642522 }, { "epoch": 0.15491372843210802, "grad_norm": 0.8909261226654053, "learning_rate": 2e-05, "loss": 0.6967, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2478, "tokens_per_second_per_gpu": 16402.21, "total_tokens": 244738893 }, { "epoch": 0.15497624406101526, "grad_norm": 0.9425103068351746, "learning_rate": 2e-05, "loss": 0.7465, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2479, "tokens_per_second_per_gpu": 17907.72, "total_tokens": 244839025 }, { "epoch": 0.15503875968992248, "grad_norm": 0.9723900556564331, "learning_rate": 2e-05, "loss": 0.7337, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2480, "tokens_per_second_per_gpu": 17694.79, "total_tokens": 244938184 }, { "epoch": 0.1551012753188297, "grad_norm": 0.9141882061958313, "learning_rate": 2e-05, "loss": 0.7917, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2481, "tokens_per_second_per_gpu": 18767.6, "total_tokens": 245042187 }, { "epoch": 0.15516379094773694, "grad_norm": 0.9192681312561035, "learning_rate": 2e-05, "loss": 0.6981, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2482, "tokens_per_second_per_gpu": 16422.26, "total_tokens": 245134955 }, { "epoch": 0.15522630657664416, "grad_norm": 0.9792819619178772, "learning_rate": 2e-05, "loss": 0.7386, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2483, "tokens_per_second_per_gpu": 17210.51, "total_tokens": 245233965 }, { "epoch": 0.15528882220555137, "grad_norm": 0.9165924787521362, "learning_rate": 2e-05, "loss": 0.7275, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2484, "tokens_per_second_per_gpu": 16416.73, "total_tokens": 245331722 }, { "epoch": 0.15535133783445862, "grad_norm": 0.920721173286438, "learning_rate": 2e-05, "loss": 0.6976, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2485, "tokens_per_second_per_gpu": 17270.75, "total_tokens": 245430717 }, { "epoch": 0.15541385346336584, "grad_norm": 0.9359063506126404, "learning_rate": 2e-05, "loss": 0.7425, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2486, "tokens_per_second_per_gpu": 18105.37, "total_tokens": 245531518 }, { "epoch": 0.15547636909227308, "grad_norm": 0.9713334441184998, "learning_rate": 2e-05, "loss": 0.8044, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2487, "tokens_per_second_per_gpu": 18410.73, "total_tokens": 245632448 }, { "epoch": 0.1555388847211803, "grad_norm": 0.934980571269989, "learning_rate": 2e-05, "loss": 0.7331, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2488, "tokens_per_second_per_gpu": 16754.96, "total_tokens": 245731842 }, { "epoch": 0.15560140035008752, "grad_norm": 0.9474225044250488, "learning_rate": 2e-05, "loss": 0.7751, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2489, "tokens_per_second_per_gpu": 17888.98, "total_tokens": 245833665 }, { "epoch": 0.15566391597899476, "grad_norm": 0.9611337780952454, "learning_rate": 2e-05, "loss": 0.7113, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2490, "tokens_per_second_per_gpu": 17294.77, "total_tokens": 245934280 }, { "epoch": 0.15572643160790198, "grad_norm": 0.8950450420379639, "learning_rate": 2e-05, "loss": 0.7235, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2491, "tokens_per_second_per_gpu": 17506.65, "total_tokens": 246035708 }, { "epoch": 0.1557889472368092, "grad_norm": 0.940353274345398, "learning_rate": 2e-05, "loss": 0.7283, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2492, "tokens_per_second_per_gpu": 16982.57, "total_tokens": 246133976 }, { "epoch": 0.15585146286571644, "grad_norm": 0.9030771851539612, "learning_rate": 2e-05, "loss": 0.7389, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2493, "tokens_per_second_per_gpu": 16965.2, "total_tokens": 246234662 }, { "epoch": 0.15591397849462366, "grad_norm": 0.986770749092102, "learning_rate": 2e-05, "loss": 0.7523, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2494, "tokens_per_second_per_gpu": 17668.09, "total_tokens": 246333010 }, { "epoch": 0.15597649412353087, "grad_norm": 0.9415619969367981, "learning_rate": 2e-05, "loss": 0.6873, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2495, "tokens_per_second_per_gpu": 16464.01, "total_tokens": 246423933 }, { "epoch": 0.15603900975243812, "grad_norm": 0.9871627688407898, "learning_rate": 2e-05, "loss": 0.7195, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2496, "tokens_per_second_per_gpu": 17629.92, "total_tokens": 246524173 }, { "epoch": 0.15610152538134534, "grad_norm": 0.9107473492622375, "learning_rate": 2e-05, "loss": 0.7206, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2497, "tokens_per_second_per_gpu": 17946.52, "total_tokens": 246622922 }, { "epoch": 0.15616404101025255, "grad_norm": 0.94288170337677, "learning_rate": 2e-05, "loss": 0.7274, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2498, "tokens_per_second_per_gpu": 16763.9, "total_tokens": 246718313 }, { "epoch": 0.1562265566391598, "grad_norm": 0.9263092279434204, "learning_rate": 2e-05, "loss": 0.7588, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2499, "tokens_per_second_per_gpu": 17517.2, "total_tokens": 246818539 }, { "epoch": 0.15628907226806701, "grad_norm": 0.9593397378921509, "learning_rate": 2e-05, "loss": 0.7331, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2500, "tokens_per_second_per_gpu": 18429.83, "total_tokens": 246920971 }, { "epoch": 0.15635158789697423, "grad_norm": 0.9383399486541748, "learning_rate": 2e-05, "loss": 0.7338, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2501, "tokens_per_second_per_gpu": 17430.43, "total_tokens": 247022616 }, { "epoch": 0.15641410352588148, "grad_norm": 0.9655659198760986, "learning_rate": 2e-05, "loss": 0.7145, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2502, "tokens_per_second_per_gpu": 16917.15, "total_tokens": 247121571 }, { "epoch": 0.1564766191547887, "grad_norm": 0.9550907015800476, "learning_rate": 2e-05, "loss": 0.7511, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2503, "tokens_per_second_per_gpu": 17217.12, "total_tokens": 247221077 }, { "epoch": 0.1565391347836959, "grad_norm": 1.010575532913208, "learning_rate": 2e-05, "loss": 0.7001, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2504, "tokens_per_second_per_gpu": 16161.25, "total_tokens": 247318610 }, { "epoch": 0.15660165041260315, "grad_norm": 0.9898800849914551, "learning_rate": 2e-05, "loss": 0.7551, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2505, "tokens_per_second_per_gpu": 17062.85, "total_tokens": 247416517 }, { "epoch": 0.15666416604151037, "grad_norm": 0.9413971304893494, "learning_rate": 2e-05, "loss": 0.7713, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2506, "tokens_per_second_per_gpu": 17657.09, "total_tokens": 247517087 }, { "epoch": 0.15672668167041762, "grad_norm": 0.9583605527877808, "learning_rate": 2e-05, "loss": 0.7384, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2507, "tokens_per_second_per_gpu": 16537.18, "total_tokens": 247612792 }, { "epoch": 0.15678919729932483, "grad_norm": 0.9007663726806641, "learning_rate": 2e-05, "loss": 0.7126, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2508, "tokens_per_second_per_gpu": 16964.79, "total_tokens": 247711391 }, { "epoch": 0.15685171292823205, "grad_norm": 0.912754476070404, "learning_rate": 2e-05, "loss": 0.7517, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2509, "tokens_per_second_per_gpu": 18221.58, "total_tokens": 247815041 }, { "epoch": 0.1569142285571393, "grad_norm": 0.8981224298477173, "learning_rate": 2e-05, "loss": 0.7907, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2510, "tokens_per_second_per_gpu": 17878.26, "total_tokens": 247918062 }, { "epoch": 0.1569767441860465, "grad_norm": 0.9471485018730164, "learning_rate": 2e-05, "loss": 0.7195, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2511, "tokens_per_second_per_gpu": 16799.16, "total_tokens": 248014386 }, { "epoch": 0.15703925981495373, "grad_norm": 0.8837004899978638, "learning_rate": 2e-05, "loss": 0.7182, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2512, "tokens_per_second_per_gpu": 17597.12, "total_tokens": 248113994 }, { "epoch": 0.15710177544386097, "grad_norm": 1.0170315504074097, "learning_rate": 2e-05, "loss": 0.7162, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2513, "tokens_per_second_per_gpu": 17400.29, "total_tokens": 248206874 }, { "epoch": 0.1571642910727682, "grad_norm": 0.8975809216499329, "learning_rate": 2e-05, "loss": 0.7124, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2514, "tokens_per_second_per_gpu": 17980.45, "total_tokens": 248306926 }, { "epoch": 0.1572268067016754, "grad_norm": 0.9388651847839355, "learning_rate": 2e-05, "loss": 0.6842, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2515, "tokens_per_second_per_gpu": 17026.76, "total_tokens": 248402300 }, { "epoch": 0.15728932233058265, "grad_norm": 0.8968009352684021, "learning_rate": 2e-05, "loss": 0.7345, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2516, "tokens_per_second_per_gpu": 18315.38, "total_tokens": 248508578 }, { "epoch": 0.15735183795948987, "grad_norm": 0.9085729122161865, "learning_rate": 2e-05, "loss": 0.7326, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2517, "tokens_per_second_per_gpu": 17044.03, "total_tokens": 248606660 }, { "epoch": 0.1574143535883971, "grad_norm": 0.9014217257499695, "learning_rate": 2e-05, "loss": 0.7641, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2518, "tokens_per_second_per_gpu": 17970.98, "total_tokens": 248709305 }, { "epoch": 0.15747686921730433, "grad_norm": 0.8867865204811096, "learning_rate": 2e-05, "loss": 0.7482, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2519, "tokens_per_second_per_gpu": 17494.64, "total_tokens": 248808494 }, { "epoch": 0.15753938484621155, "grad_norm": 0.9312760829925537, "learning_rate": 2e-05, "loss": 0.7487, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2520, "tokens_per_second_per_gpu": 16911.84, "total_tokens": 248904231 }, { "epoch": 0.15760190047511877, "grad_norm": 0.886488139629364, "learning_rate": 2e-05, "loss": 0.7205, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2521, "tokens_per_second_per_gpu": 16996.1, "total_tokens": 249003248 }, { "epoch": 0.157664416104026, "grad_norm": 0.9415410161018372, "learning_rate": 2e-05, "loss": 0.7027, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2522, "tokens_per_second_per_gpu": 17359.86, "total_tokens": 249100809 }, { "epoch": 0.15772693173293323, "grad_norm": 0.9178008437156677, "learning_rate": 2e-05, "loss": 0.7704, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2523, "tokens_per_second_per_gpu": 17116.35, "total_tokens": 249202495 }, { "epoch": 0.15778944736184047, "grad_norm": 0.9574923515319824, "learning_rate": 2e-05, "loss": 0.7438, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2524, "tokens_per_second_per_gpu": 18577.61, "total_tokens": 249303834 }, { "epoch": 0.1578519629907477, "grad_norm": 0.9193930625915527, "learning_rate": 2e-05, "loss": 0.7093, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2525, "tokens_per_second_per_gpu": 17260.55, "total_tokens": 249404127 }, { "epoch": 0.1579144786196549, "grad_norm": 0.882026195526123, "learning_rate": 2e-05, "loss": 0.7506, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2526, "tokens_per_second_per_gpu": 18143.84, "total_tokens": 249506200 }, { "epoch": 0.15797699424856215, "grad_norm": 0.9272036552429199, "learning_rate": 2e-05, "loss": 0.745, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2527, "tokens_per_second_per_gpu": 17924.23, "total_tokens": 249606826 }, { "epoch": 0.15803950987746937, "grad_norm": 0.9351441860198975, "learning_rate": 2e-05, "loss": 0.7906, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2528, "tokens_per_second_per_gpu": 18552.58, "total_tokens": 249711178 }, { "epoch": 0.1581020255063766, "grad_norm": 0.8827376365661621, "learning_rate": 2e-05, "loss": 0.7141, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2529, "tokens_per_second_per_gpu": 18142.37, "total_tokens": 249808757 }, { "epoch": 0.15816454113528383, "grad_norm": 0.9008426070213318, "learning_rate": 2e-05, "loss": 0.7387, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2530, "tokens_per_second_per_gpu": 17502.94, "total_tokens": 249908983 }, { "epoch": 0.15822705676419105, "grad_norm": 0.8738570213317871, "learning_rate": 2e-05, "loss": 0.7159, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2531, "tokens_per_second_per_gpu": 17188.19, "total_tokens": 250010693 }, { "epoch": 0.15828957239309827, "grad_norm": 0.9959209561347961, "learning_rate": 2e-05, "loss": 0.7648, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2532, "tokens_per_second_per_gpu": 17740.31, "total_tokens": 250106424 }, { "epoch": 0.1583520880220055, "grad_norm": 1.0119807720184326, "learning_rate": 2e-05, "loss": 0.7333, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2533, "tokens_per_second_per_gpu": 17793.88, "total_tokens": 250207452 }, { "epoch": 0.15841460365091273, "grad_norm": 0.9406057000160217, "learning_rate": 2e-05, "loss": 0.7182, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2534, "tokens_per_second_per_gpu": 15507.33, "total_tokens": 250301818 }, { "epoch": 0.15847711927981994, "grad_norm": 0.9417920112609863, "learning_rate": 2e-05, "loss": 0.7418, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2535, "tokens_per_second_per_gpu": 17410.44, "total_tokens": 250399612 }, { "epoch": 0.1585396349087272, "grad_norm": 0.9076671600341797, "learning_rate": 2e-05, "loss": 0.7164, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2536, "tokens_per_second_per_gpu": 18168.6, "total_tokens": 250499907 }, { "epoch": 0.1586021505376344, "grad_norm": 0.9409372210502625, "learning_rate": 2e-05, "loss": 0.7315, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2537, "tokens_per_second_per_gpu": 17472.06, "total_tokens": 250596721 }, { "epoch": 0.15866466616654162, "grad_norm": 0.9276371598243713, "learning_rate": 2e-05, "loss": 0.7547, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2538, "tokens_per_second_per_gpu": 18579.36, "total_tokens": 250698295 }, { "epoch": 0.15872718179544887, "grad_norm": 0.9451554417610168, "learning_rate": 2e-05, "loss": 0.7442, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2539, "tokens_per_second_per_gpu": 17698.93, "total_tokens": 250799590 }, { "epoch": 0.15878969742435609, "grad_norm": 0.9019706845283508, "learning_rate": 2e-05, "loss": 0.7687, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2540, "tokens_per_second_per_gpu": 17934.91, "total_tokens": 250901559 }, { "epoch": 0.1588522130532633, "grad_norm": 0.9502086043357849, "learning_rate": 2e-05, "loss": 0.7284, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2541, "tokens_per_second_per_gpu": 15950.95, "total_tokens": 250995736 }, { "epoch": 0.15891472868217055, "grad_norm": 0.9469085931777954, "learning_rate": 2e-05, "loss": 0.703, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2542, "tokens_per_second_per_gpu": 16087.41, "total_tokens": 251090634 }, { "epoch": 0.15897724431107776, "grad_norm": 0.8922246098518372, "learning_rate": 2e-05, "loss": 0.7159, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2543, "tokens_per_second_per_gpu": 16833.91, "total_tokens": 251189390 }, { "epoch": 0.159039759939985, "grad_norm": 0.9121125936508179, "learning_rate": 2e-05, "loss": 0.7321, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2544, "tokens_per_second_per_gpu": 17845.51, "total_tokens": 251287544 }, { "epoch": 0.15910227556889223, "grad_norm": 0.8810409903526306, "learning_rate": 2e-05, "loss": 0.6937, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2545, "tokens_per_second_per_gpu": 15629.49, "total_tokens": 251383339 }, { "epoch": 0.15916479119779944, "grad_norm": 0.8949509263038635, "learning_rate": 2e-05, "loss": 0.7384, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2546, "tokens_per_second_per_gpu": 17651.28, "total_tokens": 251483135 }, { "epoch": 0.1592273068267067, "grad_norm": 0.8954589366912842, "learning_rate": 2e-05, "loss": 0.7402, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2547, "tokens_per_second_per_gpu": 18388.13, "total_tokens": 251585679 }, { "epoch": 0.1592898224556139, "grad_norm": 0.9072478413581848, "learning_rate": 2e-05, "loss": 0.7355, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2548, "tokens_per_second_per_gpu": 17872.85, "total_tokens": 251685385 }, { "epoch": 0.15935233808452112, "grad_norm": 0.915660560131073, "learning_rate": 2e-05, "loss": 0.7212, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2549, "tokens_per_second_per_gpu": 15850.83, "total_tokens": 251781348 }, { "epoch": 0.15941485371342837, "grad_norm": 0.9478598237037659, "learning_rate": 2e-05, "loss": 0.6932, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2550, "tokens_per_second_per_gpu": 16000.48, "total_tokens": 251876520 }, { "epoch": 0.15947736934233558, "grad_norm": 0.9391351938247681, "learning_rate": 2e-05, "loss": 0.7163, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2551, "tokens_per_second_per_gpu": 17239.29, "total_tokens": 251975584 }, { "epoch": 0.1595398849712428, "grad_norm": 0.943652331829071, "learning_rate": 2e-05, "loss": 0.7167, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2552, "tokens_per_second_per_gpu": 16563.48, "total_tokens": 252069801 }, { "epoch": 0.15960240060015005, "grad_norm": 0.935165286064148, "learning_rate": 2e-05, "loss": 0.699, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2553, "tokens_per_second_per_gpu": 16901.39, "total_tokens": 252166018 }, { "epoch": 0.15966491622905726, "grad_norm": 0.9225239753723145, "learning_rate": 2e-05, "loss": 0.7148, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2554, "tokens_per_second_per_gpu": 17907.14, "total_tokens": 252263452 }, { "epoch": 0.15972743185796448, "grad_norm": 0.9811998009681702, "learning_rate": 2e-05, "loss": 0.7674, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2555, "tokens_per_second_per_gpu": 15365.33, "total_tokens": 252356879 }, { "epoch": 0.15978994748687173, "grad_norm": 0.9249045848846436, "learning_rate": 2e-05, "loss": 0.6958, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2556, "tokens_per_second_per_gpu": 15990.06, "total_tokens": 252451630 }, { "epoch": 0.15985246311577894, "grad_norm": 0.9403735399246216, "learning_rate": 2e-05, "loss": 0.7681, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2557, "tokens_per_second_per_gpu": 18429.83, "total_tokens": 252556696 }, { "epoch": 0.15991497874468616, "grad_norm": 0.9252206683158875, "learning_rate": 2e-05, "loss": 0.7296, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2558, "tokens_per_second_per_gpu": 17243.0, "total_tokens": 252656140 }, { "epoch": 0.1599774943735934, "grad_norm": 0.9088497161865234, "learning_rate": 2e-05, "loss": 0.7374, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2559, "tokens_per_second_per_gpu": 17111.79, "total_tokens": 252755663 }, { "epoch": 0.16004001000250062, "grad_norm": 0.9212162494659424, "learning_rate": 2e-05, "loss": 0.7465, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2560, "tokens_per_second_per_gpu": 17650.45, "total_tokens": 252858333 }, { "epoch": 0.16010252563140784, "grad_norm": 0.9602421522140503, "learning_rate": 2e-05, "loss": 0.7492, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2561, "tokens_per_second_per_gpu": 18183.4, "total_tokens": 252957188 }, { "epoch": 0.16016504126031508, "grad_norm": 0.915571391582489, "learning_rate": 2e-05, "loss": 0.72, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2562, "tokens_per_second_per_gpu": 16644.26, "total_tokens": 253054281 }, { "epoch": 0.1602275568892223, "grad_norm": 1.009250283241272, "learning_rate": 2e-05, "loss": 0.7774, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2563, "tokens_per_second_per_gpu": 17648.48, "total_tokens": 253156948 }, { "epoch": 0.16029007251812954, "grad_norm": 0.9212608337402344, "learning_rate": 2e-05, "loss": 0.7501, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2564, "tokens_per_second_per_gpu": 18138.8, "total_tokens": 253256026 }, { "epoch": 0.16035258814703676, "grad_norm": 0.9426302909851074, "learning_rate": 2e-05, "loss": 0.6809, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2565, "tokens_per_second_per_gpu": 17205.4, "total_tokens": 253356089 }, { "epoch": 0.16041510377594398, "grad_norm": 0.8801544904708862, "learning_rate": 2e-05, "loss": 0.7433, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2566, "tokens_per_second_per_gpu": 18436.53, "total_tokens": 253459323 }, { "epoch": 0.16047761940485122, "grad_norm": 0.9430611729621887, "learning_rate": 2e-05, "loss": 0.703, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2567, "tokens_per_second_per_gpu": 17300.23, "total_tokens": 253555713 }, { "epoch": 0.16054013503375844, "grad_norm": 0.9392822980880737, "learning_rate": 2e-05, "loss": 0.7502, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2568, "tokens_per_second_per_gpu": 17634.79, "total_tokens": 253655206 }, { "epoch": 0.16060265066266566, "grad_norm": 0.918808102607727, "learning_rate": 2e-05, "loss": 0.7087, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2569, "tokens_per_second_per_gpu": 17197.44, "total_tokens": 253755603 }, { "epoch": 0.1606651662915729, "grad_norm": 0.9170203804969788, "learning_rate": 2e-05, "loss": 0.7178, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2570, "tokens_per_second_per_gpu": 17076.24, "total_tokens": 253853595 }, { "epoch": 0.16072768192048012, "grad_norm": 0.9139009714126587, "learning_rate": 2e-05, "loss": 0.7391, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2571, "tokens_per_second_per_gpu": 16733.14, "total_tokens": 253951961 }, { "epoch": 0.16079019754938734, "grad_norm": 0.9368557333946228, "learning_rate": 2e-05, "loss": 0.7351, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2572, "tokens_per_second_per_gpu": 17610.42, "total_tokens": 254047432 }, { "epoch": 0.16085271317829458, "grad_norm": 0.9010105133056641, "learning_rate": 2e-05, "loss": 0.7195, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2573, "tokens_per_second_per_gpu": 18446.31, "total_tokens": 254149473 }, { "epoch": 0.1609152288072018, "grad_norm": 0.906936764717102, "learning_rate": 2e-05, "loss": 0.7298, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2574, "tokens_per_second_per_gpu": 17482.1, "total_tokens": 254247837 }, { "epoch": 0.16097774443610902, "grad_norm": 0.8955718278884888, "learning_rate": 2e-05, "loss": 0.7203, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2575, "tokens_per_second_per_gpu": 18243.8, "total_tokens": 254351047 }, { "epoch": 0.16104026006501626, "grad_norm": 0.8866704702377319, "learning_rate": 2e-05, "loss": 0.7078, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2576, "tokens_per_second_per_gpu": 17092.38, "total_tokens": 254447953 }, { "epoch": 0.16110277569392348, "grad_norm": 0.9104113578796387, "learning_rate": 2e-05, "loss": 0.7436, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2577, "tokens_per_second_per_gpu": 18079.15, "total_tokens": 254549243 }, { "epoch": 0.1611652913228307, "grad_norm": 0.89226895570755, "learning_rate": 2e-05, "loss": 0.7261, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2578, "tokens_per_second_per_gpu": 17335.72, "total_tokens": 254648038 }, { "epoch": 0.16122780695173794, "grad_norm": 0.9040558934211731, "learning_rate": 2e-05, "loss": 0.7384, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2579, "tokens_per_second_per_gpu": 17634.34, "total_tokens": 254747242 }, { "epoch": 0.16129032258064516, "grad_norm": 0.9236862659454346, "learning_rate": 2e-05, "loss": 0.7092, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2580, "tokens_per_second_per_gpu": 15579.76, "total_tokens": 254839676 }, { "epoch": 0.1613528382095524, "grad_norm": 0.9402416944503784, "learning_rate": 2e-05, "loss": 0.7511, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2581, "tokens_per_second_per_gpu": 17549.94, "total_tokens": 254936728 }, { "epoch": 0.16141535383845962, "grad_norm": 0.9362578988075256, "learning_rate": 2e-05, "loss": 0.7367, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2582, "tokens_per_second_per_gpu": 16850.02, "total_tokens": 255032393 }, { "epoch": 0.16147786946736684, "grad_norm": 0.9583765864372253, "learning_rate": 2e-05, "loss": 0.7767, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2583, "tokens_per_second_per_gpu": 17906.9, "total_tokens": 255129844 }, { "epoch": 0.16154038509627408, "grad_norm": 1.0032682418823242, "learning_rate": 2e-05, "loss": 0.7478, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2584, "tokens_per_second_per_gpu": 16077.67, "total_tokens": 255216034 }, { "epoch": 0.1616029007251813, "grad_norm": 0.9109172821044922, "learning_rate": 2e-05, "loss": 0.7246, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2585, "tokens_per_second_per_gpu": 16865.44, "total_tokens": 255316663 }, { "epoch": 0.16166541635408851, "grad_norm": 0.9312459230422974, "learning_rate": 2e-05, "loss": 0.7358, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2586, "tokens_per_second_per_gpu": 17905.38, "total_tokens": 255417183 }, { "epoch": 0.16172793198299576, "grad_norm": 0.9024844169616699, "learning_rate": 2e-05, "loss": 0.7086, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2587, "tokens_per_second_per_gpu": 17399.06, "total_tokens": 255515795 }, { "epoch": 0.16179044761190298, "grad_norm": 0.9107148051261902, "learning_rate": 2e-05, "loss": 0.7311, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2588, "tokens_per_second_per_gpu": 17322.31, "total_tokens": 255612345 }, { "epoch": 0.1618529632408102, "grad_norm": 0.9896182417869568, "learning_rate": 2e-05, "loss": 0.6902, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2589, "tokens_per_second_per_gpu": 16071.36, "total_tokens": 255704048 }, { "epoch": 0.16191547886971744, "grad_norm": 0.9853972792625427, "learning_rate": 2e-05, "loss": 0.743, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2590, "tokens_per_second_per_gpu": 17376.88, "total_tokens": 255803885 }, { "epoch": 0.16197799449862466, "grad_norm": 0.8796993494033813, "learning_rate": 2e-05, "loss": 0.6739, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2591, "tokens_per_second_per_gpu": 16772.29, "total_tokens": 255899266 }, { "epoch": 0.16204051012753187, "grad_norm": 1.0045362710952759, "learning_rate": 2e-05, "loss": 0.7436, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2592, "tokens_per_second_per_gpu": 18127.66, "total_tokens": 255997662 }, { "epoch": 0.16210302575643912, "grad_norm": 0.9075884222984314, "learning_rate": 2e-05, "loss": 0.6992, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2593, "tokens_per_second_per_gpu": 18046.79, "total_tokens": 256094730 }, { "epoch": 0.16216554138534633, "grad_norm": 0.8687986135482788, "learning_rate": 2e-05, "loss": 0.731, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2594, "tokens_per_second_per_gpu": 18001.68, "total_tokens": 256197681 }, { "epoch": 0.16222805701425355, "grad_norm": 0.9279903173446655, "learning_rate": 2e-05, "loss": 0.7487, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2595, "tokens_per_second_per_gpu": 17032.16, "total_tokens": 256295991 }, { "epoch": 0.1622905726431608, "grad_norm": 0.9400684237480164, "learning_rate": 2e-05, "loss": 0.7429, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2596, "tokens_per_second_per_gpu": 17973.65, "total_tokens": 256393508 }, { "epoch": 0.162353088272068, "grad_norm": 0.9117801189422607, "learning_rate": 2e-05, "loss": 0.7389, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2597, "tokens_per_second_per_gpu": 17531.99, "total_tokens": 256496110 }, { "epoch": 0.16241560390097523, "grad_norm": 0.9436708092689514, "learning_rate": 2e-05, "loss": 0.7279, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2598, "tokens_per_second_per_gpu": 15314.75, "total_tokens": 256589184 }, { "epoch": 0.16247811952988248, "grad_norm": 0.888282060623169, "learning_rate": 2e-05, "loss": 0.7661, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2599, "tokens_per_second_per_gpu": 17668.9, "total_tokens": 256691125 }, { "epoch": 0.1625406351587897, "grad_norm": 0.9345611333847046, "learning_rate": 2e-05, "loss": 0.7181, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2600, "tokens_per_second_per_gpu": 17339.83, "total_tokens": 256786869 }, { "epoch": 0.16260315078769694, "grad_norm": 0.9194596409797668, "learning_rate": 2e-05, "loss": 0.7346, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2601, "tokens_per_second_per_gpu": 17798.47, "total_tokens": 256889489 }, { "epoch": 0.16266566641660415, "grad_norm": 0.9028118252754211, "learning_rate": 2e-05, "loss": 0.6959, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2602, "tokens_per_second_per_gpu": 18182.19, "total_tokens": 256988589 }, { "epoch": 0.16272818204551137, "grad_norm": 0.9039663076400757, "learning_rate": 2e-05, "loss": 0.759, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2603, "tokens_per_second_per_gpu": 15973.74, "total_tokens": 257086044 }, { "epoch": 0.16279069767441862, "grad_norm": 0.9797655940055847, "learning_rate": 2e-05, "loss": 0.7458, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2604, "tokens_per_second_per_gpu": 16952.28, "total_tokens": 257182587 }, { "epoch": 0.16285321330332583, "grad_norm": 0.9411249160766602, "learning_rate": 2e-05, "loss": 0.766, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2605, "tokens_per_second_per_gpu": 17659.19, "total_tokens": 257282040 }, { "epoch": 0.16291572893223305, "grad_norm": 0.8985130786895752, "learning_rate": 2e-05, "loss": 0.7378, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2606, "tokens_per_second_per_gpu": 17314.12, "total_tokens": 257382751 }, { "epoch": 0.1629782445611403, "grad_norm": 0.9037193059921265, "learning_rate": 2e-05, "loss": 0.7374, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2607, "tokens_per_second_per_gpu": 16583.42, "total_tokens": 257480297 }, { "epoch": 0.1630407601900475, "grad_norm": 0.9185985326766968, "learning_rate": 2e-05, "loss": 0.7445, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2608, "tokens_per_second_per_gpu": 17731.41, "total_tokens": 257582034 }, { "epoch": 0.16310327581895473, "grad_norm": 0.9047990441322327, "learning_rate": 2e-05, "loss": 0.7653, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2609, "tokens_per_second_per_gpu": 17544.2, "total_tokens": 257682819 }, { "epoch": 0.16316579144786197, "grad_norm": 0.8999179601669312, "learning_rate": 2e-05, "loss": 0.7681, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2610, "tokens_per_second_per_gpu": 18794.92, "total_tokens": 257791142 }, { "epoch": 0.1632283070767692, "grad_norm": 0.9020719528198242, "learning_rate": 2e-05, "loss": 0.7113, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2611, "tokens_per_second_per_gpu": 17089.76, "total_tokens": 257888933 }, { "epoch": 0.1632908227056764, "grad_norm": 0.9070421457290649, "learning_rate": 2e-05, "loss": 0.7435, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2612, "tokens_per_second_per_gpu": 16553.99, "total_tokens": 257982756 }, { "epoch": 0.16335333833458365, "grad_norm": 0.8960455656051636, "learning_rate": 2e-05, "loss": 0.6808, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2613, "tokens_per_second_per_gpu": 16485.13, "total_tokens": 258078664 }, { "epoch": 0.16341585396349087, "grad_norm": 0.8829322457313538, "learning_rate": 2e-05, "loss": 0.6815, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2614, "tokens_per_second_per_gpu": 18045.76, "total_tokens": 258179537 }, { "epoch": 0.1634783695923981, "grad_norm": 0.9047417044639587, "learning_rate": 2e-05, "loss": 0.678, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2615, "tokens_per_second_per_gpu": 16005.75, "total_tokens": 258276641 }, { "epoch": 0.16354088522130533, "grad_norm": 0.8825477361679077, "learning_rate": 2e-05, "loss": 0.717, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2616, "tokens_per_second_per_gpu": 17257.06, "total_tokens": 258375831 }, { "epoch": 0.16360340085021255, "grad_norm": 0.9177144169807434, "learning_rate": 2e-05, "loss": 0.7354, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2617, "tokens_per_second_per_gpu": 18045.4, "total_tokens": 258475779 }, { "epoch": 0.16366591647911977, "grad_norm": 0.9267773032188416, "learning_rate": 2e-05, "loss": 0.7597, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2618, "tokens_per_second_per_gpu": 18767.72, "total_tokens": 258578591 }, { "epoch": 0.163728432108027, "grad_norm": 0.9414200186729431, "learning_rate": 2e-05, "loss": 0.7208, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2619, "tokens_per_second_per_gpu": 17543.77, "total_tokens": 258674627 }, { "epoch": 0.16379094773693423, "grad_norm": 0.9330568313598633, "learning_rate": 2e-05, "loss": 0.7109, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2620, "tokens_per_second_per_gpu": 17761.61, "total_tokens": 258770276 }, { "epoch": 0.16385346336584147, "grad_norm": 0.928149938583374, "learning_rate": 2e-05, "loss": 0.7306, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2621, "tokens_per_second_per_gpu": 17166.49, "total_tokens": 258868478 }, { "epoch": 0.1639159789947487, "grad_norm": 0.9585267305374146, "learning_rate": 2e-05, "loss": 0.7406, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2622, "tokens_per_second_per_gpu": 17654.36, "total_tokens": 258963660 }, { "epoch": 0.1639784946236559, "grad_norm": 0.9180729389190674, "learning_rate": 2e-05, "loss": 0.726, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2623, "tokens_per_second_per_gpu": 16921.49, "total_tokens": 259061593 }, { "epoch": 0.16404101025256315, "grad_norm": 0.9199208617210388, "learning_rate": 2e-05, "loss": 0.7168, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2624, "tokens_per_second_per_gpu": 17804.2, "total_tokens": 259156871 }, { "epoch": 0.16410352588147037, "grad_norm": 0.9432045817375183, "learning_rate": 2e-05, "loss": 0.7153, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2625, "tokens_per_second_per_gpu": 17841.82, "total_tokens": 259254959 }, { "epoch": 0.16416604151037759, "grad_norm": 0.9212653040885925, "learning_rate": 2e-05, "loss": 0.7293, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2626, "tokens_per_second_per_gpu": 17899.95, "total_tokens": 259354343 }, { "epoch": 0.16422855713928483, "grad_norm": 0.8776289224624634, "learning_rate": 2e-05, "loss": 0.7232, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2627, "tokens_per_second_per_gpu": 16728.24, "total_tokens": 259453916 }, { "epoch": 0.16429107276819205, "grad_norm": 0.9325109124183655, "learning_rate": 2e-05, "loss": 0.749, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2628, "tokens_per_second_per_gpu": 17624.3, "total_tokens": 259553923 }, { "epoch": 0.16435358839709927, "grad_norm": 0.9453989267349243, "learning_rate": 2e-05, "loss": 0.7433, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2629, "tokens_per_second_per_gpu": 17259.6, "total_tokens": 259651816 }, { "epoch": 0.1644161040260065, "grad_norm": 0.8696309328079224, "learning_rate": 2e-05, "loss": 0.7229, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2630, "tokens_per_second_per_gpu": 18121.48, "total_tokens": 259755322 }, { "epoch": 0.16447861965491373, "grad_norm": 0.9056925177574158, "learning_rate": 2e-05, "loss": 0.7214, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2631, "tokens_per_second_per_gpu": 17305.44, "total_tokens": 259854928 }, { "epoch": 0.16454113528382094, "grad_norm": 0.9295607209205627, "learning_rate": 2e-05, "loss": 0.6999, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2632, "tokens_per_second_per_gpu": 15839.08, "total_tokens": 259953113 }, { "epoch": 0.1646036509127282, "grad_norm": 0.8880066275596619, "learning_rate": 2e-05, "loss": 0.7225, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2633, "tokens_per_second_per_gpu": 17981.32, "total_tokens": 260052224 }, { "epoch": 0.1646661665416354, "grad_norm": 0.9222045540809631, "learning_rate": 2e-05, "loss": 0.7694, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2634, "tokens_per_second_per_gpu": 17841.86, "total_tokens": 260152998 }, { "epoch": 0.16472868217054262, "grad_norm": 0.9470266699790955, "learning_rate": 2e-05, "loss": 0.7223, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2635, "tokens_per_second_per_gpu": 16410.39, "total_tokens": 260250181 }, { "epoch": 0.16479119779944987, "grad_norm": 0.9276790022850037, "learning_rate": 2e-05, "loss": 0.7411, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2636, "tokens_per_second_per_gpu": 17346.38, "total_tokens": 260348430 }, { "epoch": 0.16485371342835708, "grad_norm": 0.9484577775001526, "learning_rate": 2e-05, "loss": 0.7255, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2637, "tokens_per_second_per_gpu": 17227.23, "total_tokens": 260445483 }, { "epoch": 0.16491622905726433, "grad_norm": 0.9433512687683105, "learning_rate": 2e-05, "loss": 0.7118, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2638, "tokens_per_second_per_gpu": 16071.5, "total_tokens": 260539519 }, { "epoch": 0.16497874468617155, "grad_norm": 0.908018946647644, "learning_rate": 2e-05, "loss": 0.7018, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2639, "tokens_per_second_per_gpu": 17098.29, "total_tokens": 260638704 }, { "epoch": 0.16504126031507876, "grad_norm": 0.9192203879356384, "learning_rate": 2e-05, "loss": 0.7365, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2640, "tokens_per_second_per_gpu": 17648.59, "total_tokens": 260738577 }, { "epoch": 0.165103775943986, "grad_norm": 0.9219305515289307, "learning_rate": 2e-05, "loss": 0.7707, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2641, "tokens_per_second_per_gpu": 17028.22, "total_tokens": 260837321 }, { "epoch": 0.16516629157289323, "grad_norm": 0.9277446269989014, "learning_rate": 2e-05, "loss": 0.7794, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2642, "tokens_per_second_per_gpu": 18244.07, "total_tokens": 260940102 }, { "epoch": 0.16522880720180044, "grad_norm": 0.9130678176879883, "learning_rate": 2e-05, "loss": 0.719, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2643, "tokens_per_second_per_gpu": 16223.33, "total_tokens": 261038969 }, { "epoch": 0.1652913228307077, "grad_norm": 0.9052194952964783, "learning_rate": 2e-05, "loss": 0.7152, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2644, "tokens_per_second_per_gpu": 17278.92, "total_tokens": 261134981 }, { "epoch": 0.1653538384596149, "grad_norm": 0.9080877900123596, "learning_rate": 2e-05, "loss": 0.744, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2645, "tokens_per_second_per_gpu": 16843.42, "total_tokens": 261233602 }, { "epoch": 0.16541635408852212, "grad_norm": 0.9760825634002686, "learning_rate": 2e-05, "loss": 0.7537, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2646, "tokens_per_second_per_gpu": 16714.65, "total_tokens": 261333069 }, { "epoch": 0.16547886971742937, "grad_norm": 0.9600096344947815, "learning_rate": 2e-05, "loss": 0.7949, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2647, "tokens_per_second_per_gpu": 17989.61, "total_tokens": 261431689 }, { "epoch": 0.16554138534633658, "grad_norm": 0.9283392429351807, "learning_rate": 2e-05, "loss": 0.7411, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2648, "tokens_per_second_per_gpu": 18104.69, "total_tokens": 261530944 }, { "epoch": 0.1656039009752438, "grad_norm": 0.9384970664978027, "learning_rate": 2e-05, "loss": 0.7194, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2649, "tokens_per_second_per_gpu": 18210.48, "total_tokens": 261632932 }, { "epoch": 0.16566641660415105, "grad_norm": 0.9107702970504761, "learning_rate": 2e-05, "loss": 0.7794, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2650, "tokens_per_second_per_gpu": 17770.53, "total_tokens": 261734213 }, { "epoch": 0.16572893223305826, "grad_norm": 0.9367051720619202, "learning_rate": 2e-05, "loss": 0.7012, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2651, "tokens_per_second_per_gpu": 16286.26, "total_tokens": 261832869 }, { "epoch": 0.16579144786196548, "grad_norm": 0.907469630241394, "learning_rate": 2e-05, "loss": 0.7545, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2652, "tokens_per_second_per_gpu": 18816.77, "total_tokens": 261938919 }, { "epoch": 0.16585396349087272, "grad_norm": 0.9350305199623108, "learning_rate": 2e-05, "loss": 0.7189, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2653, "tokens_per_second_per_gpu": 17359.47, "total_tokens": 262038133 }, { "epoch": 0.16591647911977994, "grad_norm": 0.9088319540023804, "learning_rate": 2e-05, "loss": 0.6957, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2654, "tokens_per_second_per_gpu": 17621.46, "total_tokens": 262135073 }, { "epoch": 0.16597899474868716, "grad_norm": 0.9419721364974976, "learning_rate": 2e-05, "loss": 0.7215, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2655, "tokens_per_second_per_gpu": 18265.12, "total_tokens": 262232673 }, { "epoch": 0.1660415103775944, "grad_norm": 0.9574368000030518, "learning_rate": 2e-05, "loss": 0.7318, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2656, "tokens_per_second_per_gpu": 16678.64, "total_tokens": 262331681 }, { "epoch": 0.16610402600650162, "grad_norm": 0.9196388125419617, "learning_rate": 2e-05, "loss": 0.7059, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2657, "tokens_per_second_per_gpu": 17237.48, "total_tokens": 262429289 }, { "epoch": 0.16616654163540887, "grad_norm": 0.9004754424095154, "learning_rate": 2e-05, "loss": 0.7016, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2658, "tokens_per_second_per_gpu": 16908.44, "total_tokens": 262527199 }, { "epoch": 0.16622905726431608, "grad_norm": 0.9087395668029785, "learning_rate": 2e-05, "loss": 0.722, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2659, "tokens_per_second_per_gpu": 17070.52, "total_tokens": 262625558 }, { "epoch": 0.1662915728932233, "grad_norm": 0.9390180110931396, "learning_rate": 2e-05, "loss": 0.7439, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2660, "tokens_per_second_per_gpu": 16994.84, "total_tokens": 262722185 }, { "epoch": 0.16635408852213054, "grad_norm": 0.8964182734489441, "learning_rate": 2e-05, "loss": 0.7509, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2661, "tokens_per_second_per_gpu": 17710.5, "total_tokens": 262824075 }, { "epoch": 0.16641660415103776, "grad_norm": 0.8924476504325867, "learning_rate": 2e-05, "loss": 0.662, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2662, "tokens_per_second_per_gpu": 17049.29, "total_tokens": 262921209 }, { "epoch": 0.16647911977994498, "grad_norm": 0.8922584652900696, "learning_rate": 2e-05, "loss": 0.6782, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2663, "tokens_per_second_per_gpu": 17525.38, "total_tokens": 263020261 }, { "epoch": 0.16654163540885222, "grad_norm": 0.8789050579071045, "learning_rate": 2e-05, "loss": 0.7091, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2664, "tokens_per_second_per_gpu": 17544.76, "total_tokens": 263119384 }, { "epoch": 0.16660415103775944, "grad_norm": 0.9235649704933167, "learning_rate": 2e-05, "loss": 0.7736, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2665, "tokens_per_second_per_gpu": 18070.86, "total_tokens": 263222214 }, { "epoch": 0.16666666666666666, "grad_norm": 0.9076069593429565, "learning_rate": 2e-05, "loss": 0.7063, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2666, "tokens_per_second_per_gpu": 17134.45, "total_tokens": 263323366 }, { "epoch": 0.1667291822955739, "grad_norm": 0.9080181121826172, "learning_rate": 2e-05, "loss": 0.7229, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2667, "tokens_per_second_per_gpu": 15866.91, "total_tokens": 263419465 }, { "epoch": 0.16679169792448112, "grad_norm": 0.9190274477005005, "learning_rate": 2e-05, "loss": 0.7078, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2668, "tokens_per_second_per_gpu": 17229.02, "total_tokens": 263517516 }, { "epoch": 0.16685421355338834, "grad_norm": 0.9211244583129883, "learning_rate": 2e-05, "loss": 0.7396, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2669, "tokens_per_second_per_gpu": 17439.24, "total_tokens": 263618080 }, { "epoch": 0.16691672918229558, "grad_norm": 0.9172168374061584, "learning_rate": 2e-05, "loss": 0.7131, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2670, "tokens_per_second_per_gpu": 18119.45, "total_tokens": 263720512 }, { "epoch": 0.1669792448112028, "grad_norm": 0.9496929049491882, "learning_rate": 2e-05, "loss": 0.8012, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2671, "tokens_per_second_per_gpu": 16205.4, "total_tokens": 263818266 }, { "epoch": 0.16704176044011002, "grad_norm": 0.9607657194137573, "learning_rate": 2e-05, "loss": 0.7156, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2672, "tokens_per_second_per_gpu": 17495.69, "total_tokens": 263913537 }, { "epoch": 0.16710427606901726, "grad_norm": 0.9180126786231995, "learning_rate": 2e-05, "loss": 0.7189, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2673, "tokens_per_second_per_gpu": 18159.22, "total_tokens": 264017506 }, { "epoch": 0.16716679169792448, "grad_norm": 0.8802446722984314, "learning_rate": 2e-05, "loss": 0.7073, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2674, "tokens_per_second_per_gpu": 17270.21, "total_tokens": 264117483 }, { "epoch": 0.1672293073268317, "grad_norm": 0.8869173526763916, "learning_rate": 2e-05, "loss": 0.6908, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2675, "tokens_per_second_per_gpu": 17810.73, "total_tokens": 264220061 }, { "epoch": 0.16729182295573894, "grad_norm": 0.9320928454399109, "learning_rate": 2e-05, "loss": 0.7047, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2676, "tokens_per_second_per_gpu": 17334.22, "total_tokens": 264318446 }, { "epoch": 0.16735433858464616, "grad_norm": 0.9915482997894287, "learning_rate": 2e-05, "loss": 0.7122, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2677, "tokens_per_second_per_gpu": 17545.53, "total_tokens": 264418745 }, { "epoch": 0.1674168542135534, "grad_norm": 0.8843052387237549, "learning_rate": 2e-05, "loss": 0.7364, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2678, "tokens_per_second_per_gpu": 17514.09, "total_tokens": 264518720 }, { "epoch": 0.16747936984246062, "grad_norm": 0.8954319953918457, "learning_rate": 2e-05, "loss": 0.7371, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2679, "tokens_per_second_per_gpu": 17960.69, "total_tokens": 264618881 }, { "epoch": 0.16754188547136784, "grad_norm": 0.8918929696083069, "learning_rate": 2e-05, "loss": 0.7302, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2680, "tokens_per_second_per_gpu": 18085.73, "total_tokens": 264722816 }, { "epoch": 0.16760440110027508, "grad_norm": 0.9602476954460144, "learning_rate": 2e-05, "loss": 0.687, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2681, "tokens_per_second_per_gpu": 16150.65, "total_tokens": 264817731 }, { "epoch": 0.1676669167291823, "grad_norm": 0.9795159101486206, "learning_rate": 2e-05, "loss": 0.7746, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2682, "tokens_per_second_per_gpu": 17497.23, "total_tokens": 264913488 }, { "epoch": 0.16772943235808951, "grad_norm": 0.9378470778465271, "learning_rate": 2e-05, "loss": 0.726, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2683, "tokens_per_second_per_gpu": 17336.93, "total_tokens": 265012637 }, { "epoch": 0.16779194798699676, "grad_norm": 0.9726511240005493, "learning_rate": 2e-05, "loss": 0.7448, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2684, "tokens_per_second_per_gpu": 16423.69, "total_tokens": 265108863 }, { "epoch": 0.16785446361590398, "grad_norm": 0.9664660096168518, "learning_rate": 2e-05, "loss": 0.7631, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2685, "tokens_per_second_per_gpu": 17745.76, "total_tokens": 265210302 }, { "epoch": 0.1679169792448112, "grad_norm": 0.9143555164337158, "learning_rate": 2e-05, "loss": 0.7629, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2686, "tokens_per_second_per_gpu": 18426.56, "total_tokens": 265313544 }, { "epoch": 0.16797949487371844, "grad_norm": 0.9137793183326721, "learning_rate": 2e-05, "loss": 0.7286, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2687, "tokens_per_second_per_gpu": 18434.74, "total_tokens": 265415075 }, { "epoch": 0.16804201050262565, "grad_norm": 0.9643433690071106, "learning_rate": 2e-05, "loss": 0.7145, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2688, "tokens_per_second_per_gpu": 16333.05, "total_tokens": 265511237 }, { "epoch": 0.16810452613153287, "grad_norm": 0.9575275182723999, "learning_rate": 2e-05, "loss": 0.7706, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2689, "tokens_per_second_per_gpu": 17188.69, "total_tokens": 265612525 }, { "epoch": 0.16816704176044012, "grad_norm": 0.9123136401176453, "learning_rate": 2e-05, "loss": 0.7043, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2690, "tokens_per_second_per_gpu": 16848.34, "total_tokens": 265709446 }, { "epoch": 0.16822955738934733, "grad_norm": 0.9354833960533142, "learning_rate": 2e-05, "loss": 0.6902, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2691, "tokens_per_second_per_gpu": 15863.16, "total_tokens": 265800972 }, { "epoch": 0.16829207301825455, "grad_norm": 0.9265931248664856, "learning_rate": 2e-05, "loss": 0.7106, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2692, "tokens_per_second_per_gpu": 16480.28, "total_tokens": 265895488 }, { "epoch": 0.1683545886471618, "grad_norm": 0.924310564994812, "learning_rate": 2e-05, "loss": 0.6775, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2693, "tokens_per_second_per_gpu": 17841.39, "total_tokens": 265993237 }, { "epoch": 0.168417104276069, "grad_norm": 0.8518446683883667, "learning_rate": 2e-05, "loss": 0.7196, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2694, "tokens_per_second_per_gpu": 18582.2, "total_tokens": 266098900 }, { "epoch": 0.16847961990497626, "grad_norm": 0.9309942126274109, "learning_rate": 2e-05, "loss": 0.7197, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2695, "tokens_per_second_per_gpu": 17504.75, "total_tokens": 266197706 }, { "epoch": 0.16854213553388347, "grad_norm": 0.9504106640815735, "learning_rate": 2e-05, "loss": 0.69, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2696, "tokens_per_second_per_gpu": 14735.62, "total_tokens": 266293141 }, { "epoch": 0.1686046511627907, "grad_norm": 0.9593204259872437, "learning_rate": 2e-05, "loss": 0.7413, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2697, "tokens_per_second_per_gpu": 16894.82, "total_tokens": 266388508 }, { "epoch": 0.16866716679169794, "grad_norm": 0.8929693698883057, "learning_rate": 2e-05, "loss": 0.7035, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2698, "tokens_per_second_per_gpu": 17634.77, "total_tokens": 266488544 }, { "epoch": 0.16872968242060515, "grad_norm": 0.921454131603241, "learning_rate": 2e-05, "loss": 0.7233, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2699, "tokens_per_second_per_gpu": 17881.04, "total_tokens": 266590842 }, { "epoch": 0.16879219804951237, "grad_norm": 0.9201894402503967, "learning_rate": 2e-05, "loss": 0.7114, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2700, "tokens_per_second_per_gpu": 16536.07, "total_tokens": 266688521 }, { "epoch": 0.16885471367841962, "grad_norm": 0.9535343647003174, "learning_rate": 2e-05, "loss": 0.7842, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2701, "tokens_per_second_per_gpu": 16567.42, "total_tokens": 266786865 }, { "epoch": 0.16891722930732683, "grad_norm": 0.8827305436134338, "learning_rate": 2e-05, "loss": 0.7163, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2702, "tokens_per_second_per_gpu": 17842.84, "total_tokens": 266886986 }, { "epoch": 0.16897974493623405, "grad_norm": 0.917460024356842, "learning_rate": 2e-05, "loss": 0.7319, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2703, "tokens_per_second_per_gpu": 17571.26, "total_tokens": 266986171 }, { "epoch": 0.1690422605651413, "grad_norm": 0.9203165173530579, "learning_rate": 2e-05, "loss": 0.6845, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2704, "tokens_per_second_per_gpu": 16315.89, "total_tokens": 267081874 }, { "epoch": 0.1691047761940485, "grad_norm": 0.907627522945404, "learning_rate": 2e-05, "loss": 0.7072, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2705, "tokens_per_second_per_gpu": 17332.59, "total_tokens": 267179335 }, { "epoch": 0.16916729182295573, "grad_norm": 0.9095898866653442, "learning_rate": 2e-05, "loss": 0.6947, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2706, "tokens_per_second_per_gpu": 16621.8, "total_tokens": 267277115 }, { "epoch": 0.16922980745186297, "grad_norm": 0.9110832810401917, "learning_rate": 2e-05, "loss": 0.7299, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2707, "tokens_per_second_per_gpu": 18122.26, "total_tokens": 267380664 }, { "epoch": 0.1692923230807702, "grad_norm": 0.9431723356246948, "learning_rate": 2e-05, "loss": 0.7331, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2708, "tokens_per_second_per_gpu": 15858.51, "total_tokens": 267477337 }, { "epoch": 0.1693548387096774, "grad_norm": 0.8831946849822998, "learning_rate": 2e-05, "loss": 0.7601, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2709, "tokens_per_second_per_gpu": 17258.62, "total_tokens": 267578313 }, { "epoch": 0.16941735433858465, "grad_norm": 0.9694076180458069, "learning_rate": 2e-05, "loss": 0.7681, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2710, "tokens_per_second_per_gpu": 18132.29, "total_tokens": 267680314 }, { "epoch": 0.16947986996749187, "grad_norm": 0.9146000742912292, "learning_rate": 2e-05, "loss": 0.7142, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2711, "tokens_per_second_per_gpu": 17370.13, "total_tokens": 267780865 }, { "epoch": 0.1695423855963991, "grad_norm": 0.8834778070449829, "learning_rate": 2e-05, "loss": 0.7355, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2712, "tokens_per_second_per_gpu": 16976.74, "total_tokens": 267880315 }, { "epoch": 0.16960490122530633, "grad_norm": 0.9062361717224121, "learning_rate": 2e-05, "loss": 0.7189, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2713, "tokens_per_second_per_gpu": 16559.29, "total_tokens": 267980192 }, { "epoch": 0.16966741685421355, "grad_norm": 0.9471052885055542, "learning_rate": 2e-05, "loss": 0.7346, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2714, "tokens_per_second_per_gpu": 15785.35, "total_tokens": 268073008 }, { "epoch": 0.1697299324831208, "grad_norm": 0.9159033298492432, "learning_rate": 2e-05, "loss": 0.7457, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2715, "tokens_per_second_per_gpu": 17998.25, "total_tokens": 268173104 }, { "epoch": 0.169792448112028, "grad_norm": 0.9074848294258118, "learning_rate": 2e-05, "loss": 0.7146, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2716, "tokens_per_second_per_gpu": 17899.82, "total_tokens": 268272263 }, { "epoch": 0.16985496374093523, "grad_norm": 0.9329647421836853, "learning_rate": 2e-05, "loss": 0.741, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2717, "tokens_per_second_per_gpu": 15811.8, "total_tokens": 268366578 }, { "epoch": 0.16991747936984247, "grad_norm": 0.9262526035308838, "learning_rate": 2e-05, "loss": 0.7647, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2718, "tokens_per_second_per_gpu": 16845.0, "total_tokens": 268466040 }, { "epoch": 0.1699799949987497, "grad_norm": 0.9181159138679504, "learning_rate": 2e-05, "loss": 0.7845, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2719, "tokens_per_second_per_gpu": 17414.55, "total_tokens": 268565872 }, { "epoch": 0.1700425106276569, "grad_norm": 0.9423139095306396, "learning_rate": 2e-05, "loss": 0.685, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2720, "tokens_per_second_per_gpu": 16249.91, "total_tokens": 268661799 }, { "epoch": 0.17010502625656415, "grad_norm": 0.9230812191963196, "learning_rate": 2e-05, "loss": 0.7305, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2721, "tokens_per_second_per_gpu": 18437.44, "total_tokens": 268764759 }, { "epoch": 0.17016754188547137, "grad_norm": 0.9299038052558899, "learning_rate": 2e-05, "loss": 0.7335, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2722, "tokens_per_second_per_gpu": 16713.94, "total_tokens": 268860938 }, { "epoch": 0.17023005751437859, "grad_norm": 0.9060327410697937, "learning_rate": 2e-05, "loss": 0.7304, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2723, "tokens_per_second_per_gpu": 18772.84, "total_tokens": 268962879 }, { "epoch": 0.17029257314328583, "grad_norm": 0.9891898036003113, "learning_rate": 2e-05, "loss": 0.7366, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2724, "tokens_per_second_per_gpu": 17077.81, "total_tokens": 269061316 }, { "epoch": 0.17035508877219305, "grad_norm": 0.9204680919647217, "learning_rate": 2e-05, "loss": 0.7871, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2725, "tokens_per_second_per_gpu": 18568.64, "total_tokens": 269164313 }, { "epoch": 0.17041760440110026, "grad_norm": 0.9765559434890747, "learning_rate": 2e-05, "loss": 0.7093, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2726, "tokens_per_second_per_gpu": 17051.52, "total_tokens": 269260156 }, { "epoch": 0.1704801200300075, "grad_norm": 0.9354212284088135, "learning_rate": 2e-05, "loss": 0.7645, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2727, "tokens_per_second_per_gpu": 17059.83, "total_tokens": 269359832 }, { "epoch": 0.17054263565891473, "grad_norm": 0.9823940396308899, "learning_rate": 2e-05, "loss": 0.7394, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2728, "tokens_per_second_per_gpu": 17550.5, "total_tokens": 269457139 }, { "epoch": 0.17060515128782194, "grad_norm": 0.9810733199119568, "learning_rate": 2e-05, "loss": 0.6851, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2729, "tokens_per_second_per_gpu": 17802.65, "total_tokens": 269549277 }, { "epoch": 0.1706676669167292, "grad_norm": 0.9727417230606079, "learning_rate": 2e-05, "loss": 0.7607, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2730, "tokens_per_second_per_gpu": 17106.6, "total_tokens": 269645310 }, { "epoch": 0.1707301825456364, "grad_norm": 0.9546655416488647, "learning_rate": 2e-05, "loss": 0.7066, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2731, "tokens_per_second_per_gpu": 16817.21, "total_tokens": 269740241 }, { "epoch": 0.17079269817454362, "grad_norm": 0.9357644319534302, "learning_rate": 2e-05, "loss": 0.7284, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2732, "tokens_per_second_per_gpu": 16988.88, "total_tokens": 269834684 }, { "epoch": 0.17085521380345087, "grad_norm": 0.9676441550254822, "learning_rate": 2e-05, "loss": 0.7527, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2733, "tokens_per_second_per_gpu": 17071.61, "total_tokens": 269933112 }, { "epoch": 0.17091772943235808, "grad_norm": 0.9169116616249084, "learning_rate": 2e-05, "loss": 0.7958, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2734, "tokens_per_second_per_gpu": 17100.57, "total_tokens": 270035022 }, { "epoch": 0.17098024506126533, "grad_norm": 0.9605956077575684, "learning_rate": 2e-05, "loss": 0.7751, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2735, "tokens_per_second_per_gpu": 18511.95, "total_tokens": 270136686 }, { "epoch": 0.17104276069017255, "grad_norm": 0.9248852133750916, "learning_rate": 2e-05, "loss": 0.7642, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2736, "tokens_per_second_per_gpu": 18017.57, "total_tokens": 270239441 }, { "epoch": 0.17110527631907976, "grad_norm": 0.9244498610496521, "learning_rate": 2e-05, "loss": 0.7568, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2737, "tokens_per_second_per_gpu": 16468.34, "total_tokens": 270336504 }, { "epoch": 0.171167791947987, "grad_norm": 0.9871739745140076, "learning_rate": 2e-05, "loss": 0.7498, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2738, "tokens_per_second_per_gpu": 17263.83, "total_tokens": 270436889 }, { "epoch": 0.17123030757689423, "grad_norm": 0.8765407204627991, "learning_rate": 2e-05, "loss": 0.7221, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2739, "tokens_per_second_per_gpu": 17773.88, "total_tokens": 270539714 }, { "epoch": 0.17129282320580144, "grad_norm": 0.9301278591156006, "learning_rate": 2e-05, "loss": 0.707, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2740, "tokens_per_second_per_gpu": 16916.94, "total_tokens": 270640427 }, { "epoch": 0.1713553388347087, "grad_norm": 0.8751048445701599, "learning_rate": 2e-05, "loss": 0.701, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2741, "tokens_per_second_per_gpu": 17966.6, "total_tokens": 270739776 }, { "epoch": 0.1714178544636159, "grad_norm": 0.925272524356842, "learning_rate": 2e-05, "loss": 0.7373, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2742, "tokens_per_second_per_gpu": 16594.69, "total_tokens": 270836072 }, { "epoch": 0.17148037009252312, "grad_norm": 0.9117201566696167, "learning_rate": 2e-05, "loss": 0.6981, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2743, "tokens_per_second_per_gpu": 16740.68, "total_tokens": 270931995 }, { "epoch": 0.17154288572143037, "grad_norm": 0.9235600233078003, "learning_rate": 2e-05, "loss": 0.7335, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2744, "tokens_per_second_per_gpu": 17662.94, "total_tokens": 271031709 }, { "epoch": 0.17160540135033758, "grad_norm": 0.8960293531417847, "learning_rate": 2e-05, "loss": 0.6866, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2745, "tokens_per_second_per_gpu": 17025.78, "total_tokens": 271129148 }, { "epoch": 0.1716679169792448, "grad_norm": 0.9174222350120544, "learning_rate": 2e-05, "loss": 0.754, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2746, "tokens_per_second_per_gpu": 17929.09, "total_tokens": 271229658 }, { "epoch": 0.17173043260815204, "grad_norm": 0.9573174118995667, "learning_rate": 2e-05, "loss": 0.7587, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2747, "tokens_per_second_per_gpu": 17426.14, "total_tokens": 271328873 }, { "epoch": 0.17179294823705926, "grad_norm": 0.8930690288543701, "learning_rate": 2e-05, "loss": 0.7212, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2748, "tokens_per_second_per_gpu": 18150.95, "total_tokens": 271429315 }, { "epoch": 0.17185546386596648, "grad_norm": 0.9051510691642761, "learning_rate": 2e-05, "loss": 0.6929, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2749, "tokens_per_second_per_gpu": 16095.56, "total_tokens": 271522750 }, { "epoch": 0.17191797949487372, "grad_norm": 0.9344185590744019, "learning_rate": 2e-05, "loss": 0.7847, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2750, "tokens_per_second_per_gpu": 17956.91, "total_tokens": 271625437 }, { "epoch": 0.17198049512378094, "grad_norm": 0.9235514998435974, "learning_rate": 2e-05, "loss": 0.7935, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2751, "tokens_per_second_per_gpu": 18507.57, "total_tokens": 271729499 }, { "epoch": 0.17204301075268819, "grad_norm": 0.9929126501083374, "learning_rate": 2e-05, "loss": 0.7418, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2752, "tokens_per_second_per_gpu": 17571.88, "total_tokens": 271826729 }, { "epoch": 0.1721055263815954, "grad_norm": 0.959722101688385, "learning_rate": 2e-05, "loss": 0.7451, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2753, "tokens_per_second_per_gpu": 16761.09, "total_tokens": 271923438 }, { "epoch": 0.17216804201050262, "grad_norm": 0.8743822574615479, "learning_rate": 2e-05, "loss": 0.7473, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2754, "tokens_per_second_per_gpu": 18368.34, "total_tokens": 272026353 }, { "epoch": 0.17223055763940986, "grad_norm": 0.8786420226097107, "learning_rate": 2e-05, "loss": 0.7566, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2755, "tokens_per_second_per_gpu": 18204.62, "total_tokens": 272128171 }, { "epoch": 0.17229307326831708, "grad_norm": 0.9905439615249634, "learning_rate": 2e-05, "loss": 0.7184, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2756, "tokens_per_second_per_gpu": 16904.74, "total_tokens": 272226770 }, { "epoch": 0.1723555888972243, "grad_norm": 0.9330574870109558, "learning_rate": 2e-05, "loss": 0.7212, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2757, "tokens_per_second_per_gpu": 17228.95, "total_tokens": 272324100 }, { "epoch": 0.17241810452613154, "grad_norm": 0.8950364589691162, "learning_rate": 2e-05, "loss": 0.7421, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2758, "tokens_per_second_per_gpu": 18921.13, "total_tokens": 272430759 }, { "epoch": 0.17248062015503876, "grad_norm": 0.9251489043235779, "learning_rate": 2e-05, "loss": 0.7386, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2759, "tokens_per_second_per_gpu": 17921.13, "total_tokens": 272530208 }, { "epoch": 0.17254313578394598, "grad_norm": 0.934929609298706, "learning_rate": 2e-05, "loss": 0.7192, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2760, "tokens_per_second_per_gpu": 17001.82, "total_tokens": 272625484 }, { "epoch": 0.17260565141285322, "grad_norm": 0.9521545171737671, "learning_rate": 2e-05, "loss": 0.7349, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2761, "tokens_per_second_per_gpu": 18037.52, "total_tokens": 272720311 }, { "epoch": 0.17266816704176044, "grad_norm": 0.9678106904029846, "learning_rate": 2e-05, "loss": 0.7406, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2762, "tokens_per_second_per_gpu": 17328.93, "total_tokens": 272819896 }, { "epoch": 0.17273068267066766, "grad_norm": 0.9025922417640686, "learning_rate": 2e-05, "loss": 0.7049, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2763, "tokens_per_second_per_gpu": 16679.25, "total_tokens": 272919127 }, { "epoch": 0.1727931982995749, "grad_norm": 0.8980532288551331, "learning_rate": 2e-05, "loss": 0.7262, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2764, "tokens_per_second_per_gpu": 17160.5, "total_tokens": 273018049 }, { "epoch": 0.17285571392848212, "grad_norm": 0.9994246959686279, "learning_rate": 2e-05, "loss": 0.7243, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2765, "tokens_per_second_per_gpu": 17234.52, "total_tokens": 273118208 }, { "epoch": 0.17291822955738934, "grad_norm": 0.9358934760093689, "learning_rate": 2e-05, "loss": 0.7247, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2766, "tokens_per_second_per_gpu": 17687.01, "total_tokens": 273219249 }, { "epoch": 0.17298074518629658, "grad_norm": 0.9668897986412048, "learning_rate": 2e-05, "loss": 0.7668, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2767, "tokens_per_second_per_gpu": 17348.73, "total_tokens": 273315630 }, { "epoch": 0.1730432608152038, "grad_norm": 0.9953017830848694, "learning_rate": 2e-05, "loss": 0.7292, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2768, "tokens_per_second_per_gpu": 16672.19, "total_tokens": 273411648 }, { "epoch": 0.17310577644411101, "grad_norm": 0.9217242002487183, "learning_rate": 2e-05, "loss": 0.6924, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2769, "tokens_per_second_per_gpu": 17857.88, "total_tokens": 273509553 }, { "epoch": 0.17316829207301826, "grad_norm": 0.9064300656318665, "learning_rate": 2e-05, "loss": 0.7043, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2770, "tokens_per_second_per_gpu": 17481.74, "total_tokens": 273610550 }, { "epoch": 0.17323080770192548, "grad_norm": 0.9106606245040894, "learning_rate": 2e-05, "loss": 0.7483, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2771, "tokens_per_second_per_gpu": 18102.26, "total_tokens": 273714107 }, { "epoch": 0.17329332333083272, "grad_norm": 1.060757040977478, "learning_rate": 2e-05, "loss": 0.72, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2772, "tokens_per_second_per_gpu": 17352.35, "total_tokens": 273813768 }, { "epoch": 0.17335583895973994, "grad_norm": 0.9667404890060425, "learning_rate": 2e-05, "loss": 0.7711, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2773, "tokens_per_second_per_gpu": 18034.36, "total_tokens": 273916425 }, { "epoch": 0.17341835458864716, "grad_norm": 0.936884880065918, "learning_rate": 2e-05, "loss": 0.7086, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2774, "tokens_per_second_per_gpu": 16900.46, "total_tokens": 274013464 }, { "epoch": 0.1734808702175544, "grad_norm": 0.9113118648529053, "learning_rate": 2e-05, "loss": 0.697, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2775, "tokens_per_second_per_gpu": 16847.73, "total_tokens": 274110772 }, { "epoch": 0.17354338584646162, "grad_norm": 0.9281507134437561, "learning_rate": 2e-05, "loss": 0.7053, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2776, "tokens_per_second_per_gpu": 16487.37, "total_tokens": 274208572 }, { "epoch": 0.17360590147536883, "grad_norm": 0.9574482440948486, "learning_rate": 2e-05, "loss": 0.6836, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2777, "tokens_per_second_per_gpu": 16536.16, "total_tokens": 274303096 }, { "epoch": 0.17366841710427608, "grad_norm": 0.9441009163856506, "learning_rate": 2e-05, "loss": 0.7363, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2778, "tokens_per_second_per_gpu": 17338.99, "total_tokens": 274401370 }, { "epoch": 0.1737309327331833, "grad_norm": 0.9470068216323853, "learning_rate": 2e-05, "loss": 0.7677, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2779, "tokens_per_second_per_gpu": 17580.33, "total_tokens": 274502325 }, { "epoch": 0.1737934483620905, "grad_norm": 0.9721143841743469, "learning_rate": 2e-05, "loss": 0.7438, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2780, "tokens_per_second_per_gpu": 16989.27, "total_tokens": 274598655 }, { "epoch": 0.17385596399099776, "grad_norm": 0.9095749258995056, "learning_rate": 2e-05, "loss": 0.7021, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2781, "tokens_per_second_per_gpu": 16635.5, "total_tokens": 274696633 }, { "epoch": 0.17391847961990498, "grad_norm": 0.8896320462226868, "learning_rate": 2e-05, "loss": 0.7146, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2782, "tokens_per_second_per_gpu": 17235.95, "total_tokens": 274797446 }, { "epoch": 0.1739809952488122, "grad_norm": 0.9330723881721497, "learning_rate": 2e-05, "loss": 0.7574, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2783, "tokens_per_second_per_gpu": 17100.1, "total_tokens": 274897515 }, { "epoch": 0.17404351087771944, "grad_norm": 0.9201729893684387, "learning_rate": 2e-05, "loss": 0.7053, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2784, "tokens_per_second_per_gpu": 16965.19, "total_tokens": 274995140 }, { "epoch": 0.17410602650662665, "grad_norm": 0.951765775680542, "learning_rate": 2e-05, "loss": 0.7127, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2785, "tokens_per_second_per_gpu": 17903.22, "total_tokens": 275091437 }, { "epoch": 0.17416854213553387, "grad_norm": 0.9369325041770935, "learning_rate": 2e-05, "loss": 0.7032, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2786, "tokens_per_second_per_gpu": 17275.75, "total_tokens": 275191389 }, { "epoch": 0.17423105776444112, "grad_norm": 0.898357629776001, "learning_rate": 2e-05, "loss": 0.76, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2787, "tokens_per_second_per_gpu": 18590.28, "total_tokens": 275293703 }, { "epoch": 0.17429357339334833, "grad_norm": 0.9055443406105042, "learning_rate": 2e-05, "loss": 0.724, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2788, "tokens_per_second_per_gpu": 17444.55, "total_tokens": 275393567 }, { "epoch": 0.17435608902225555, "grad_norm": 0.9483789205551147, "learning_rate": 2e-05, "loss": 0.7416, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2789, "tokens_per_second_per_gpu": 16972.52, "total_tokens": 275491379 }, { "epoch": 0.1744186046511628, "grad_norm": 0.9221799969673157, "learning_rate": 2e-05, "loss": 0.7537, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2790, "tokens_per_second_per_gpu": 18368.85, "total_tokens": 275596521 }, { "epoch": 0.17448112028007, "grad_norm": 0.9018255472183228, "learning_rate": 2e-05, "loss": 0.7215, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2791, "tokens_per_second_per_gpu": 17576.89, "total_tokens": 275696434 }, { "epoch": 0.17454363590897726, "grad_norm": 0.9172285199165344, "learning_rate": 2e-05, "loss": 0.7395, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2792, "tokens_per_second_per_gpu": 16945.13, "total_tokens": 275796261 }, { "epoch": 0.17460615153788447, "grad_norm": 0.9227660298347473, "learning_rate": 2e-05, "loss": 0.725, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2793, "tokens_per_second_per_gpu": 17803.61, "total_tokens": 275897681 }, { "epoch": 0.1746686671667917, "grad_norm": 0.9103137254714966, "learning_rate": 2e-05, "loss": 0.7419, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2794, "tokens_per_second_per_gpu": 16402.81, "total_tokens": 275996140 }, { "epoch": 0.17473118279569894, "grad_norm": 0.9473270177841187, "learning_rate": 2e-05, "loss": 0.7263, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2795, "tokens_per_second_per_gpu": 16784.05, "total_tokens": 276093720 }, { "epoch": 0.17479369842460615, "grad_norm": 0.8976336717605591, "learning_rate": 2e-05, "loss": 0.7368, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2796, "tokens_per_second_per_gpu": 17743.49, "total_tokens": 276192957 }, { "epoch": 0.17485621405351337, "grad_norm": 0.8954249024391174, "learning_rate": 2e-05, "loss": 0.6667, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2797, "tokens_per_second_per_gpu": 17735.93, "total_tokens": 276292957 }, { "epoch": 0.17491872968242062, "grad_norm": 0.9136927127838135, "learning_rate": 2e-05, "loss": 0.752, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2798, "tokens_per_second_per_gpu": 16843.12, "total_tokens": 276394275 }, { "epoch": 0.17498124531132783, "grad_norm": 0.9354670643806458, "learning_rate": 2e-05, "loss": 0.733, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2799, "tokens_per_second_per_gpu": 17768.86, "total_tokens": 276493671 }, { "epoch": 0.17504376094023505, "grad_norm": 0.8770763278007507, "learning_rate": 2e-05, "loss": 0.7409, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2800, "tokens_per_second_per_gpu": 18732.77, "total_tokens": 276597021 }, { "epoch": 0.1751062765691423, "grad_norm": 0.9179648160934448, "learning_rate": 2e-05, "loss": 0.7238, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2801, "tokens_per_second_per_gpu": 16896.75, "total_tokens": 276692636 }, { "epoch": 0.1751687921980495, "grad_norm": 0.9958091378211975, "learning_rate": 2e-05, "loss": 0.764, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2802, "tokens_per_second_per_gpu": 17028.82, "total_tokens": 276793382 }, { "epoch": 0.17523130782695673, "grad_norm": 0.9230484962463379, "learning_rate": 2e-05, "loss": 0.7143, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2803, "tokens_per_second_per_gpu": 17524.38, "total_tokens": 276890864 }, { "epoch": 0.17529382345586397, "grad_norm": 0.8825454711914062, "learning_rate": 2e-05, "loss": 0.7661, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2804, "tokens_per_second_per_gpu": 18290.32, "total_tokens": 276990744 }, { "epoch": 0.1753563390847712, "grad_norm": 0.8767498731613159, "learning_rate": 2e-05, "loss": 0.6832, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2805, "tokens_per_second_per_gpu": 15423.17, "total_tokens": 277087697 }, { "epoch": 0.1754188547136784, "grad_norm": 0.875377357006073, "learning_rate": 2e-05, "loss": 0.6999, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2806, "tokens_per_second_per_gpu": 16684.54, "total_tokens": 277186484 }, { "epoch": 0.17548137034258565, "grad_norm": 0.9746974110603333, "learning_rate": 2e-05, "loss": 0.7831, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2807, "tokens_per_second_per_gpu": 17153.07, "total_tokens": 277287649 }, { "epoch": 0.17554388597149287, "grad_norm": 0.8955265879631042, "learning_rate": 2e-05, "loss": 0.7488, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2808, "tokens_per_second_per_gpu": 17887.65, "total_tokens": 277390033 }, { "epoch": 0.1756064016004001, "grad_norm": 0.8918383717536926, "learning_rate": 2e-05, "loss": 0.6953, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2809, "tokens_per_second_per_gpu": 17177.85, "total_tokens": 277487450 }, { "epoch": 0.17566891722930733, "grad_norm": 0.919098436832428, "learning_rate": 2e-05, "loss": 0.708, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2810, "tokens_per_second_per_gpu": 17928.38, "total_tokens": 277585846 }, { "epoch": 0.17573143285821455, "grad_norm": 0.9535427689552307, "learning_rate": 2e-05, "loss": 0.729, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2811, "tokens_per_second_per_gpu": 17744.61, "total_tokens": 277682802 }, { "epoch": 0.1757939484871218, "grad_norm": 0.9170882701873779, "learning_rate": 2e-05, "loss": 0.7997, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2812, "tokens_per_second_per_gpu": 18610.27, "total_tokens": 277789337 }, { "epoch": 0.175856464116029, "grad_norm": 0.9274720549583435, "learning_rate": 2e-05, "loss": 0.719, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2813, "tokens_per_second_per_gpu": 17065.94, "total_tokens": 277885682 }, { "epoch": 0.17591897974493623, "grad_norm": 0.9432968497276306, "learning_rate": 2e-05, "loss": 0.6914, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2814, "tokens_per_second_per_gpu": 17726.04, "total_tokens": 277980615 }, { "epoch": 0.17598149537384347, "grad_norm": 0.9605250358581543, "learning_rate": 2e-05, "loss": 0.6834, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2815, "tokens_per_second_per_gpu": 16813.28, "total_tokens": 278075576 }, { "epoch": 0.1760440110027507, "grad_norm": 0.8920073509216309, "learning_rate": 2e-05, "loss": 0.7017, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2816, "tokens_per_second_per_gpu": 17861.29, "total_tokens": 278175360 }, { "epoch": 0.1761065266316579, "grad_norm": 0.9644456505775452, "learning_rate": 2e-05, "loss": 0.7427, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2817, "tokens_per_second_per_gpu": 17236.46, "total_tokens": 278276459 }, { "epoch": 0.17616904226056515, "grad_norm": 0.9336052536964417, "learning_rate": 2e-05, "loss": 0.7191, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2818, "tokens_per_second_per_gpu": 16702.44, "total_tokens": 278375110 }, { "epoch": 0.17623155788947237, "grad_norm": 1.015062689781189, "learning_rate": 2e-05, "loss": 0.7593, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2819, "tokens_per_second_per_gpu": 17536.68, "total_tokens": 278473638 }, { "epoch": 0.17629407351837958, "grad_norm": 0.957187831401825, "learning_rate": 2e-05, "loss": 0.7311, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2820, "tokens_per_second_per_gpu": 17660.71, "total_tokens": 278570319 }, { "epoch": 0.17635658914728683, "grad_norm": 0.9848501086235046, "learning_rate": 2e-05, "loss": 0.7242, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2821, "tokens_per_second_per_gpu": 16731.65, "total_tokens": 278667638 }, { "epoch": 0.17641910477619405, "grad_norm": 0.920342206954956, "learning_rate": 2e-05, "loss": 0.7311, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2822, "tokens_per_second_per_gpu": 17093.64, "total_tokens": 278764605 }, { "epoch": 0.17648162040510126, "grad_norm": 0.9593011736869812, "learning_rate": 2e-05, "loss": 0.7632, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2823, "tokens_per_second_per_gpu": 18586.04, "total_tokens": 278866341 }, { "epoch": 0.1765441360340085, "grad_norm": 0.9718986749649048, "learning_rate": 2e-05, "loss": 0.7161, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2824, "tokens_per_second_per_gpu": 15728.79, "total_tokens": 278962678 }, { "epoch": 0.17660665166291573, "grad_norm": 1.0004853010177612, "learning_rate": 2e-05, "loss": 0.7097, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2825, "tokens_per_second_per_gpu": 17765.34, "total_tokens": 279063647 }, { "epoch": 0.17666916729182294, "grad_norm": 0.891535222530365, "learning_rate": 2e-05, "loss": 0.715, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2826, "tokens_per_second_per_gpu": 18730.56, "total_tokens": 279165838 }, { "epoch": 0.1767316829207302, "grad_norm": 0.9218605756759644, "learning_rate": 2e-05, "loss": 0.7329, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2827, "tokens_per_second_per_gpu": 16309.2, "total_tokens": 279263160 }, { "epoch": 0.1767941985496374, "grad_norm": 1.0008153915405273, "learning_rate": 2e-05, "loss": 0.7, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2828, "tokens_per_second_per_gpu": 16755.53, "total_tokens": 279360590 }, { "epoch": 0.17685671417854465, "grad_norm": 0.9253450632095337, "learning_rate": 2e-05, "loss": 0.7158, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2829, "tokens_per_second_per_gpu": 17456.35, "total_tokens": 279459117 }, { "epoch": 0.17691922980745187, "grad_norm": 0.8856090307235718, "learning_rate": 2e-05, "loss": 0.7225, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2830, "tokens_per_second_per_gpu": 17604.59, "total_tokens": 279559076 }, { "epoch": 0.17698174543635908, "grad_norm": 0.9120262861251831, "learning_rate": 2e-05, "loss": 0.7488, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2831, "tokens_per_second_per_gpu": 18317.97, "total_tokens": 279661882 }, { "epoch": 0.17704426106526633, "grad_norm": 0.9646255970001221, "learning_rate": 2e-05, "loss": 0.7518, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2832, "tokens_per_second_per_gpu": 18705.68, "total_tokens": 279766678 }, { "epoch": 0.17710677669417355, "grad_norm": 0.9118945598602295, "learning_rate": 2e-05, "loss": 0.6933, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2833, "tokens_per_second_per_gpu": 16794.75, "total_tokens": 279864175 }, { "epoch": 0.17716929232308076, "grad_norm": 0.896767795085907, "learning_rate": 2e-05, "loss": 0.7145, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2834, "tokens_per_second_per_gpu": 17304.33, "total_tokens": 279965454 }, { "epoch": 0.177231807951988, "grad_norm": 0.9012964367866516, "learning_rate": 2e-05, "loss": 0.7683, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2835, "tokens_per_second_per_gpu": 17581.16, "total_tokens": 280065493 }, { "epoch": 0.17729432358089522, "grad_norm": 0.9251433610916138, "learning_rate": 2e-05, "loss": 0.7232, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2836, "tokens_per_second_per_gpu": 17215.94, "total_tokens": 280165935 }, { "epoch": 0.17735683920980244, "grad_norm": 0.9161660075187683, "learning_rate": 2e-05, "loss": 0.7074, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2837, "tokens_per_second_per_gpu": 16685.7, "total_tokens": 280263094 }, { "epoch": 0.1774193548387097, "grad_norm": 0.9152282476425171, "learning_rate": 2e-05, "loss": 0.7366, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2838, "tokens_per_second_per_gpu": 18819.59, "total_tokens": 280364824 }, { "epoch": 0.1774818704676169, "grad_norm": 0.9101372361183167, "learning_rate": 2e-05, "loss": 0.7294, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2839, "tokens_per_second_per_gpu": 17329.06, "total_tokens": 280463592 }, { "epoch": 0.17754438609652412, "grad_norm": 0.9376832842826843, "learning_rate": 2e-05, "loss": 0.7074, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2840, "tokens_per_second_per_gpu": 16541.0, "total_tokens": 280559552 }, { "epoch": 0.17760690172543137, "grad_norm": 0.9153850078582764, "learning_rate": 2e-05, "loss": 0.7231, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2841, "tokens_per_second_per_gpu": 17755.94, "total_tokens": 280661407 }, { "epoch": 0.17766941735433858, "grad_norm": 0.8825836181640625, "learning_rate": 2e-05, "loss": 0.7106, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2842, "tokens_per_second_per_gpu": 17182.03, "total_tokens": 280758603 }, { "epoch": 0.1777319329832458, "grad_norm": 0.9125210046768188, "learning_rate": 2e-05, "loss": 0.7213, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2843, "tokens_per_second_per_gpu": 17607.94, "total_tokens": 280856703 }, { "epoch": 0.17779444861215304, "grad_norm": 0.9145292639732361, "learning_rate": 2e-05, "loss": 0.7151, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2844, "tokens_per_second_per_gpu": 17577.01, "total_tokens": 280956115 }, { "epoch": 0.17785696424106026, "grad_norm": 0.9071593880653381, "learning_rate": 2e-05, "loss": 0.7279, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2845, "tokens_per_second_per_gpu": 18077.74, "total_tokens": 281057828 }, { "epoch": 0.17791947986996748, "grad_norm": 0.9192315936088562, "learning_rate": 2e-05, "loss": 0.7228, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2846, "tokens_per_second_per_gpu": 16975.48, "total_tokens": 281154828 }, { "epoch": 0.17798199549887472, "grad_norm": 0.9188458919525146, "learning_rate": 2e-05, "loss": 0.7516, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2847, "tokens_per_second_per_gpu": 18234.96, "total_tokens": 281258101 }, { "epoch": 0.17804451112778194, "grad_norm": 0.9324948191642761, "learning_rate": 2e-05, "loss": 0.7148, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2848, "tokens_per_second_per_gpu": 17395.81, "total_tokens": 281356610 }, { "epoch": 0.17810702675668919, "grad_norm": 0.9201919436454773, "learning_rate": 2e-05, "loss": 0.6804, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2849, "tokens_per_second_per_gpu": 15485.53, "total_tokens": 281450422 }, { "epoch": 0.1781695423855964, "grad_norm": 0.9357964992523193, "learning_rate": 2e-05, "loss": 0.7057, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2850, "tokens_per_second_per_gpu": 16969.07, "total_tokens": 281548571 }, { "epoch": 0.17823205801450362, "grad_norm": 0.904999852180481, "learning_rate": 2e-05, "loss": 0.6781, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2851, "tokens_per_second_per_gpu": 17144.61, "total_tokens": 281644813 }, { "epoch": 0.17829457364341086, "grad_norm": 0.9823085069656372, "learning_rate": 2e-05, "loss": 0.7146, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2852, "tokens_per_second_per_gpu": 16771.33, "total_tokens": 281740151 }, { "epoch": 0.17835708927231808, "grad_norm": 0.905296802520752, "learning_rate": 2e-05, "loss": 0.7204, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2853, "tokens_per_second_per_gpu": 16887.59, "total_tokens": 281841568 }, { "epoch": 0.1784196049012253, "grad_norm": 0.9191571474075317, "learning_rate": 2e-05, "loss": 0.7679, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2854, "tokens_per_second_per_gpu": 17065.27, "total_tokens": 281942056 }, { "epoch": 0.17848212053013254, "grad_norm": 0.920495331287384, "learning_rate": 2e-05, "loss": 0.7236, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2855, "tokens_per_second_per_gpu": 17358.6, "total_tokens": 282041983 }, { "epoch": 0.17854463615903976, "grad_norm": 0.9469814896583557, "learning_rate": 2e-05, "loss": 0.7338, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2856, "tokens_per_second_per_gpu": 16814.47, "total_tokens": 282140646 }, { "epoch": 0.17860715178794698, "grad_norm": 0.8865935206413269, "learning_rate": 2e-05, "loss": 0.767, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2857, "tokens_per_second_per_gpu": 19318.02, "total_tokens": 282247198 }, { "epoch": 0.17866966741685422, "grad_norm": 0.8785930871963501, "learning_rate": 2e-05, "loss": 0.7412, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2858, "tokens_per_second_per_gpu": 17468.81, "total_tokens": 282350973 }, { "epoch": 0.17873218304576144, "grad_norm": 0.9013646245002747, "learning_rate": 2e-05, "loss": 0.707, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2859, "tokens_per_second_per_gpu": 16583.28, "total_tokens": 282448919 }, { "epoch": 0.17879469867466866, "grad_norm": 0.9552802443504333, "learning_rate": 2e-05, "loss": 0.7145, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2860, "tokens_per_second_per_gpu": 16973.29, "total_tokens": 282545164 }, { "epoch": 0.1788572143035759, "grad_norm": 0.897283136844635, "learning_rate": 2e-05, "loss": 0.7234, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2861, "tokens_per_second_per_gpu": 18301.7, "total_tokens": 282645578 }, { "epoch": 0.17891972993248312, "grad_norm": 0.9144786596298218, "learning_rate": 2e-05, "loss": 0.685, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2862, "tokens_per_second_per_gpu": 16921.83, "total_tokens": 282745175 }, { "epoch": 0.17898224556139034, "grad_norm": 0.9215632081031799, "learning_rate": 2e-05, "loss": 0.73, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2863, "tokens_per_second_per_gpu": 15694.32, "total_tokens": 282838236 }, { "epoch": 0.17904476119029758, "grad_norm": 0.9152722358703613, "learning_rate": 2e-05, "loss": 0.743, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2864, "tokens_per_second_per_gpu": 16978.87, "total_tokens": 282939615 }, { "epoch": 0.1791072768192048, "grad_norm": 0.9293508529663086, "learning_rate": 2e-05, "loss": 0.732, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2865, "tokens_per_second_per_gpu": 17610.15, "total_tokens": 283042062 }, { "epoch": 0.17916979244811204, "grad_norm": 0.935510516166687, "learning_rate": 2e-05, "loss": 0.7017, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2866, "tokens_per_second_per_gpu": 16334.06, "total_tokens": 283136796 }, { "epoch": 0.17923230807701926, "grad_norm": 0.8921712636947632, "learning_rate": 2e-05, "loss": 0.7235, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2867, "tokens_per_second_per_gpu": 18096.82, "total_tokens": 283238749 }, { "epoch": 0.17929482370592648, "grad_norm": 0.9105206727981567, "learning_rate": 2e-05, "loss": 0.7117, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2868, "tokens_per_second_per_gpu": 17755.36, "total_tokens": 283337214 }, { "epoch": 0.17935733933483372, "grad_norm": 0.9150083065032959, "learning_rate": 2e-05, "loss": 0.7179, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2869, "tokens_per_second_per_gpu": 16650.43, "total_tokens": 283437529 }, { "epoch": 0.17941985496374094, "grad_norm": 0.8979609608650208, "learning_rate": 2e-05, "loss": 0.6959, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2870, "tokens_per_second_per_gpu": 17404.91, "total_tokens": 283533899 }, { "epoch": 0.17948237059264815, "grad_norm": 0.9318906664848328, "learning_rate": 2e-05, "loss": 0.7251, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2871, "tokens_per_second_per_gpu": 17439.16, "total_tokens": 283633156 }, { "epoch": 0.1795448862215554, "grad_norm": 0.9203872680664062, "learning_rate": 2e-05, "loss": 0.7485, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2872, "tokens_per_second_per_gpu": 17228.38, "total_tokens": 283732571 }, { "epoch": 0.17960740185046262, "grad_norm": 0.8961928486824036, "learning_rate": 2e-05, "loss": 0.7527, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2873, "tokens_per_second_per_gpu": 17829.44, "total_tokens": 283834786 }, { "epoch": 0.17966991747936983, "grad_norm": 0.933614194393158, "learning_rate": 2e-05, "loss": 0.7847, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2874, "tokens_per_second_per_gpu": 17562.06, "total_tokens": 283937934 }, { "epoch": 0.17973243310827708, "grad_norm": 0.8958435654640198, "learning_rate": 2e-05, "loss": 0.717, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2875, "tokens_per_second_per_gpu": 17509.03, "total_tokens": 284039996 }, { "epoch": 0.1797949487371843, "grad_norm": 0.9190083742141724, "learning_rate": 2e-05, "loss": 0.692, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2876, "tokens_per_second_per_gpu": 17342.66, "total_tokens": 284140486 }, { "epoch": 0.1798574643660915, "grad_norm": 0.9429627060890198, "learning_rate": 2e-05, "loss": 0.7377, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2877, "tokens_per_second_per_gpu": 17640.21, "total_tokens": 284241039 }, { "epoch": 0.17991997999499876, "grad_norm": 0.907944917678833, "learning_rate": 2e-05, "loss": 0.6888, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2878, "tokens_per_second_per_gpu": 16676.93, "total_tokens": 284336690 }, { "epoch": 0.17998249562390597, "grad_norm": 0.8831362128257751, "learning_rate": 2e-05, "loss": 0.7085, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2879, "tokens_per_second_per_gpu": 18507.73, "total_tokens": 284438770 }, { "epoch": 0.1800450112528132, "grad_norm": 0.925061047077179, "learning_rate": 2e-05, "loss": 0.7578, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2880, "tokens_per_second_per_gpu": 18371.39, "total_tokens": 284540538 }, { "epoch": 0.18010752688172044, "grad_norm": 0.881275475025177, "learning_rate": 2e-05, "loss": 0.6853, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2881, "tokens_per_second_per_gpu": 17579.73, "total_tokens": 284638680 }, { "epoch": 0.18017004251062765, "grad_norm": 0.9116279482841492, "learning_rate": 2e-05, "loss": 0.7148, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2882, "tokens_per_second_per_gpu": 17615.27, "total_tokens": 284736511 }, { "epoch": 0.18023255813953487, "grad_norm": 0.9194029569625854, "learning_rate": 2e-05, "loss": 0.7065, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2883, "tokens_per_second_per_gpu": 17796.81, "total_tokens": 284835774 }, { "epoch": 0.18029507376844212, "grad_norm": 0.9054694175720215, "learning_rate": 2e-05, "loss": 0.7606, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2884, "tokens_per_second_per_gpu": 18675.79, "total_tokens": 284939185 }, { "epoch": 0.18035758939734933, "grad_norm": 0.9157474637031555, "learning_rate": 2e-05, "loss": 0.7312, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2885, "tokens_per_second_per_gpu": 17466.57, "total_tokens": 285037823 }, { "epoch": 0.18042010502625658, "grad_norm": 0.9725309610366821, "learning_rate": 2e-05, "loss": 0.7266, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2886, "tokens_per_second_per_gpu": 16778.94, "total_tokens": 285135167 }, { "epoch": 0.1804826206551638, "grad_norm": 0.8761250376701355, "learning_rate": 2e-05, "loss": 0.6808, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2887, "tokens_per_second_per_gpu": 17080.92, "total_tokens": 285234116 }, { "epoch": 0.180545136284071, "grad_norm": 0.9167973399162292, "learning_rate": 2e-05, "loss": 0.7284, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2888, "tokens_per_second_per_gpu": 16680.55, "total_tokens": 285330534 }, { "epoch": 0.18060765191297826, "grad_norm": 0.9207567572593689, "learning_rate": 2e-05, "loss": 0.7242, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2889, "tokens_per_second_per_gpu": 16895.87, "total_tokens": 285428671 }, { "epoch": 0.18067016754188547, "grad_norm": 0.9022278189659119, "learning_rate": 2e-05, "loss": 0.7373, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2890, "tokens_per_second_per_gpu": 17389.86, "total_tokens": 285532256 }, { "epoch": 0.1807326831707927, "grad_norm": 0.9371418952941895, "learning_rate": 2e-05, "loss": 0.7348, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2891, "tokens_per_second_per_gpu": 17938.9, "total_tokens": 285632023 }, { "epoch": 0.18079519879969994, "grad_norm": 0.9235061407089233, "learning_rate": 2e-05, "loss": 0.7626, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2892, "tokens_per_second_per_gpu": 16817.25, "total_tokens": 285731853 }, { "epoch": 0.18085771442860715, "grad_norm": 0.9128032326698303, "learning_rate": 2e-05, "loss": 0.6998, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2893, "tokens_per_second_per_gpu": 17866.28, "total_tokens": 285831851 }, { "epoch": 0.18092023005751437, "grad_norm": 0.9596571326255798, "learning_rate": 2e-05, "loss": 0.7817, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2894, "tokens_per_second_per_gpu": 17395.96, "total_tokens": 285933067 }, { "epoch": 0.18098274568642161, "grad_norm": 0.911215603351593, "learning_rate": 2e-05, "loss": 0.6851, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2895, "tokens_per_second_per_gpu": 15710.02, "total_tokens": 286028985 }, { "epoch": 0.18104526131532883, "grad_norm": 0.9296978116035461, "learning_rate": 2e-05, "loss": 0.6785, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2896, "tokens_per_second_per_gpu": 18284.31, "total_tokens": 286127537 }, { "epoch": 0.18110777694423605, "grad_norm": 0.8819538354873657, "learning_rate": 2e-05, "loss": 0.7199, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2897, "tokens_per_second_per_gpu": 17346.52, "total_tokens": 286230455 }, { "epoch": 0.1811702925731433, "grad_norm": 0.9039362072944641, "learning_rate": 2e-05, "loss": 0.7658, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2898, "tokens_per_second_per_gpu": 17043.58, "total_tokens": 286331270 }, { "epoch": 0.1812328082020505, "grad_norm": 0.95121169090271, "learning_rate": 2e-05, "loss": 0.7165, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2899, "tokens_per_second_per_gpu": 17406.47, "total_tokens": 286427552 }, { "epoch": 0.18129532383095773, "grad_norm": 0.9372190833091736, "learning_rate": 2e-05, "loss": 0.7431, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2900, "tokens_per_second_per_gpu": 17690.26, "total_tokens": 286525470 }, { "epoch": 0.18135783945986497, "grad_norm": 0.9106340408325195, "learning_rate": 2e-05, "loss": 0.7178, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2901, "tokens_per_second_per_gpu": 17110.91, "total_tokens": 286621992 }, { "epoch": 0.1814203550887722, "grad_norm": 0.9058762192726135, "learning_rate": 2e-05, "loss": 0.7075, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2902, "tokens_per_second_per_gpu": 17058.7, "total_tokens": 286719447 }, { "epoch": 0.1814828707176794, "grad_norm": 0.8944225311279297, "learning_rate": 2e-05, "loss": 0.7082, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2903, "tokens_per_second_per_gpu": 16700.76, "total_tokens": 286816138 }, { "epoch": 0.18154538634658665, "grad_norm": 0.9863740801811218, "learning_rate": 2e-05, "loss": 0.6875, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2904, "tokens_per_second_per_gpu": 17118.21, "total_tokens": 286910525 }, { "epoch": 0.18160790197549387, "grad_norm": 1.0113605260849, "learning_rate": 2e-05, "loss": 0.7306, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2905, "tokens_per_second_per_gpu": 17266.92, "total_tokens": 287007034 }, { "epoch": 0.1816704176044011, "grad_norm": 0.9208977818489075, "learning_rate": 2e-05, "loss": 0.7583, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2906, "tokens_per_second_per_gpu": 17529.2, "total_tokens": 287105430 }, { "epoch": 0.18173293323330833, "grad_norm": 0.9016545414924622, "learning_rate": 2e-05, "loss": 0.7197, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2907, "tokens_per_second_per_gpu": 17617.12, "total_tokens": 287207032 }, { "epoch": 0.18179544886221555, "grad_norm": 0.91457200050354, "learning_rate": 2e-05, "loss": 0.7178, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2908, "tokens_per_second_per_gpu": 19137.37, "total_tokens": 287309429 }, { "epoch": 0.1818579644911228, "grad_norm": 0.9612566232681274, "learning_rate": 2e-05, "loss": 0.7246, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2909, "tokens_per_second_per_gpu": 18376.5, "total_tokens": 287412475 }, { "epoch": 0.18192048012003, "grad_norm": 0.9038111567497253, "learning_rate": 2e-05, "loss": 0.6807, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2910, "tokens_per_second_per_gpu": 17064.05, "total_tokens": 287511547 }, { "epoch": 0.18198299574893723, "grad_norm": 0.9583487510681152, "learning_rate": 2e-05, "loss": 0.7645, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2911, "tokens_per_second_per_gpu": 17798.51, "total_tokens": 287612400 }, { "epoch": 0.18204551137784447, "grad_norm": 0.9471487402915955, "learning_rate": 2e-05, "loss": 0.7422, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2912, "tokens_per_second_per_gpu": 17581.13, "total_tokens": 287712087 }, { "epoch": 0.1821080270067517, "grad_norm": 0.9791850447654724, "learning_rate": 2e-05, "loss": 0.7045, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2913, "tokens_per_second_per_gpu": 18826.13, "total_tokens": 287811718 }, { "epoch": 0.1821705426356589, "grad_norm": 0.9207985401153564, "learning_rate": 2e-05, "loss": 0.7451, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2914, "tokens_per_second_per_gpu": 17948.29, "total_tokens": 287913439 }, { "epoch": 0.18223305826456615, "grad_norm": 0.9191470146179199, "learning_rate": 2e-05, "loss": 0.7368, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2915, "tokens_per_second_per_gpu": 18201.05, "total_tokens": 288014886 }, { "epoch": 0.18229557389347337, "grad_norm": 0.9359345436096191, "learning_rate": 2e-05, "loss": 0.6892, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2916, "tokens_per_second_per_gpu": 17085.85, "total_tokens": 288108593 }, { "epoch": 0.18235808952238058, "grad_norm": 0.9927412867546082, "learning_rate": 2e-05, "loss": 0.6918, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2917, "tokens_per_second_per_gpu": 16194.79, "total_tokens": 288203796 }, { "epoch": 0.18242060515128783, "grad_norm": 0.905016303062439, "learning_rate": 2e-05, "loss": 0.7173, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2918, "tokens_per_second_per_gpu": 17195.85, "total_tokens": 288301815 }, { "epoch": 0.18248312078019505, "grad_norm": 0.9076579213142395, "learning_rate": 2e-05, "loss": 0.7313, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2919, "tokens_per_second_per_gpu": 17729.43, "total_tokens": 288405440 }, { "epoch": 0.18254563640910226, "grad_norm": 0.9272303581237793, "learning_rate": 2e-05, "loss": 0.7126, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2920, "tokens_per_second_per_gpu": 16384.94, "total_tokens": 288501480 }, { "epoch": 0.1826081520380095, "grad_norm": 0.8826347589492798, "learning_rate": 2e-05, "loss": 0.7075, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2921, "tokens_per_second_per_gpu": 17745.8, "total_tokens": 288599968 }, { "epoch": 0.18267066766691673, "grad_norm": 0.9422284960746765, "learning_rate": 2e-05, "loss": 0.6986, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2922, "tokens_per_second_per_gpu": 15849.53, "total_tokens": 288693721 }, { "epoch": 0.18273318329582394, "grad_norm": 0.9095689654350281, "learning_rate": 2e-05, "loss": 0.7342, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2923, "tokens_per_second_per_gpu": 18309.01, "total_tokens": 288796974 }, { "epoch": 0.1827956989247312, "grad_norm": 0.9400908350944519, "learning_rate": 2e-05, "loss": 0.7139, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2924, "tokens_per_second_per_gpu": 18057.22, "total_tokens": 288896154 }, { "epoch": 0.1828582145536384, "grad_norm": 0.9451636075973511, "learning_rate": 2e-05, "loss": 0.7347, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2925, "tokens_per_second_per_gpu": 17298.44, "total_tokens": 288994304 }, { "epoch": 0.18292073018254565, "grad_norm": 0.9357348680496216, "learning_rate": 2e-05, "loss": 0.7537, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2926, "tokens_per_second_per_gpu": 18504.27, "total_tokens": 289097864 }, { "epoch": 0.18298324581145287, "grad_norm": 0.9302645325660706, "learning_rate": 2e-05, "loss": 0.7552, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2927, "tokens_per_second_per_gpu": 17770.45, "total_tokens": 289199136 }, { "epoch": 0.18304576144036008, "grad_norm": 0.8864611387252808, "learning_rate": 2e-05, "loss": 0.7039, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2928, "tokens_per_second_per_gpu": 16179.22, "total_tokens": 289293335 }, { "epoch": 0.18310827706926733, "grad_norm": 0.9253182411193848, "learning_rate": 2e-05, "loss": 0.7709, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2929, "tokens_per_second_per_gpu": 16105.49, "total_tokens": 289389191 }, { "epoch": 0.18317079269817454, "grad_norm": 0.9137840270996094, "learning_rate": 2e-05, "loss": 0.7371, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2930, "tokens_per_second_per_gpu": 16732.12, "total_tokens": 289484269 }, { "epoch": 0.18323330832708176, "grad_norm": 0.9047098755836487, "learning_rate": 2e-05, "loss": 0.6929, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2931, "tokens_per_second_per_gpu": 18499.2, "total_tokens": 289583427 }, { "epoch": 0.183295823955989, "grad_norm": 1.0043891668319702, "learning_rate": 2e-05, "loss": 0.6989, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2932, "tokens_per_second_per_gpu": 17915.86, "total_tokens": 289684205 }, { "epoch": 0.18335833958489622, "grad_norm": 0.9507048726081848, "learning_rate": 2e-05, "loss": 0.7283, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2933, "tokens_per_second_per_gpu": 17292.21, "total_tokens": 289780880 }, { "epoch": 0.18342085521380344, "grad_norm": 0.883039116859436, "learning_rate": 2e-05, "loss": 0.7102, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2934, "tokens_per_second_per_gpu": 18577.78, "total_tokens": 289885499 }, { "epoch": 0.18348337084271069, "grad_norm": 0.9018869400024414, "learning_rate": 2e-05, "loss": 0.706, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2935, "tokens_per_second_per_gpu": 18494.95, "total_tokens": 289986696 }, { "epoch": 0.1835458864716179, "grad_norm": 0.9680882692337036, "learning_rate": 2e-05, "loss": 0.7393, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2936, "tokens_per_second_per_gpu": 16687.75, "total_tokens": 290084515 }, { "epoch": 0.18360840210052512, "grad_norm": 0.903013288974762, "learning_rate": 2e-05, "loss": 0.6471, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2937, "tokens_per_second_per_gpu": 18326.93, "total_tokens": 290184383 }, { "epoch": 0.18367091772943236, "grad_norm": 0.9523878693580627, "learning_rate": 2e-05, "loss": 0.7423, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2938, "tokens_per_second_per_gpu": 16894.4, "total_tokens": 290281202 }, { "epoch": 0.18373343335833958, "grad_norm": 0.9629015922546387, "learning_rate": 2e-05, "loss": 0.7493, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2939, "tokens_per_second_per_gpu": 17745.88, "total_tokens": 290382732 }, { "epoch": 0.1837959489872468, "grad_norm": 0.9661985039710999, "learning_rate": 2e-05, "loss": 0.738, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2940, "tokens_per_second_per_gpu": 16683.74, "total_tokens": 290480377 }, { "epoch": 0.18385846461615404, "grad_norm": 0.9208330512046814, "learning_rate": 2e-05, "loss": 0.7075, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2941, "tokens_per_second_per_gpu": 17131.89, "total_tokens": 290579208 }, { "epoch": 0.18392098024506126, "grad_norm": 0.8933903574943542, "learning_rate": 2e-05, "loss": 0.6659, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2942, "tokens_per_second_per_gpu": 16202.81, "total_tokens": 290676654 }, { "epoch": 0.1839834958739685, "grad_norm": 0.9191637635231018, "learning_rate": 2e-05, "loss": 0.7043, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2943, "tokens_per_second_per_gpu": 16809.12, "total_tokens": 290772147 }, { "epoch": 0.18404601150287572, "grad_norm": 0.9875161647796631, "learning_rate": 2e-05, "loss": 0.7556, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2944, "tokens_per_second_per_gpu": 18333.21, "total_tokens": 290872836 }, { "epoch": 0.18410852713178294, "grad_norm": 0.9775146842002869, "learning_rate": 2e-05, "loss": 0.7191, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2945, "tokens_per_second_per_gpu": 17206.66, "total_tokens": 290970902 }, { "epoch": 0.18417104276069018, "grad_norm": 0.9681472778320312, "learning_rate": 2e-05, "loss": 0.7941, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2946, "tokens_per_second_per_gpu": 17734.48, "total_tokens": 291072410 }, { "epoch": 0.1842335583895974, "grad_norm": 0.9361527562141418, "learning_rate": 2e-05, "loss": 0.7357, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2947, "tokens_per_second_per_gpu": 17669.17, "total_tokens": 291174245 }, { "epoch": 0.18429607401850462, "grad_norm": 0.9912847876548767, "learning_rate": 2e-05, "loss": 0.7597, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2948, "tokens_per_second_per_gpu": 16486.74, "total_tokens": 291272442 }, { "epoch": 0.18435858964741186, "grad_norm": 0.9346024394035339, "learning_rate": 2e-05, "loss": 0.7277, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2949, "tokens_per_second_per_gpu": 17638.16, "total_tokens": 291373172 }, { "epoch": 0.18442110527631908, "grad_norm": 0.9247661232948303, "learning_rate": 2e-05, "loss": 0.7398, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2950, "tokens_per_second_per_gpu": 17078.88, "total_tokens": 291471990 }, { "epoch": 0.1844836209052263, "grad_norm": 0.9179471135139465, "learning_rate": 2e-05, "loss": 0.7231, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2951, "tokens_per_second_per_gpu": 18017.78, "total_tokens": 291572839 }, { "epoch": 0.18454613653413354, "grad_norm": 0.9856420159339905, "learning_rate": 2e-05, "loss": 0.7066, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2952, "tokens_per_second_per_gpu": 16693.83, "total_tokens": 291670185 }, { "epoch": 0.18460865216304076, "grad_norm": 0.9324393272399902, "learning_rate": 2e-05, "loss": 0.7477, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2953, "tokens_per_second_per_gpu": 18043.24, "total_tokens": 291771326 }, { "epoch": 0.18467116779194798, "grad_norm": 0.9726861119270325, "learning_rate": 2e-05, "loss": 0.7442, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2954, "tokens_per_second_per_gpu": 17428.43, "total_tokens": 291866742 }, { "epoch": 0.18473368342085522, "grad_norm": 0.8841732740402222, "learning_rate": 2e-05, "loss": 0.7314, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2955, "tokens_per_second_per_gpu": 17154.68, "total_tokens": 291967106 }, { "epoch": 0.18479619904976244, "grad_norm": 0.9164668917655945, "learning_rate": 2e-05, "loss": 0.708, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2956, "tokens_per_second_per_gpu": 17186.3, "total_tokens": 292065048 }, { "epoch": 0.18485871467866966, "grad_norm": 0.9168412685394287, "learning_rate": 2e-05, "loss": 0.7048, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2957, "tokens_per_second_per_gpu": 17637.07, "total_tokens": 292165315 }, { "epoch": 0.1849212303075769, "grad_norm": 0.9158667922019958, "learning_rate": 2e-05, "loss": 0.6711, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2958, "tokens_per_second_per_gpu": 16167.14, "total_tokens": 292262655 }, { "epoch": 0.18498374593648412, "grad_norm": 0.9779559373855591, "learning_rate": 2e-05, "loss": 0.7411, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2959, "tokens_per_second_per_gpu": 15857.05, "total_tokens": 292355937 }, { "epoch": 0.18504626156539133, "grad_norm": 0.9297930598258972, "learning_rate": 2e-05, "loss": 0.7336, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2960, "tokens_per_second_per_gpu": 18151.39, "total_tokens": 292455609 }, { "epoch": 0.18510877719429858, "grad_norm": 0.9248693585395813, "learning_rate": 2e-05, "loss": 0.6989, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2961, "tokens_per_second_per_gpu": 17512.91, "total_tokens": 292554315 }, { "epoch": 0.1851712928232058, "grad_norm": 0.9467350244522095, "learning_rate": 2e-05, "loss": 0.7312, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2962, "tokens_per_second_per_gpu": 18500.15, "total_tokens": 292657427 }, { "epoch": 0.18523380845211304, "grad_norm": 0.8689382076263428, "learning_rate": 2e-05, "loss": 0.7002, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2963, "tokens_per_second_per_gpu": 17937.6, "total_tokens": 292757967 }, { "epoch": 0.18529632408102026, "grad_norm": 0.9188850522041321, "learning_rate": 2e-05, "loss": 0.72, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2964, "tokens_per_second_per_gpu": 17752.03, "total_tokens": 292857858 }, { "epoch": 0.18535883970992748, "grad_norm": 0.9355546236038208, "learning_rate": 2e-05, "loss": 0.7284, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2965, "tokens_per_second_per_gpu": 16411.86, "total_tokens": 292955511 }, { "epoch": 0.18542135533883472, "grad_norm": 0.9348012804985046, "learning_rate": 2e-05, "loss": 0.6804, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2966, "tokens_per_second_per_gpu": 16754.15, "total_tokens": 293053231 }, { "epoch": 0.18548387096774194, "grad_norm": 0.9085249900817871, "learning_rate": 2e-05, "loss": 0.744, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2967, "tokens_per_second_per_gpu": 16980.32, "total_tokens": 293151718 }, { "epoch": 0.18554638659664915, "grad_norm": 0.8575968742370605, "learning_rate": 2e-05, "loss": 0.6922, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2968, "tokens_per_second_per_gpu": 17683.31, "total_tokens": 293253442 }, { "epoch": 0.1856089022255564, "grad_norm": 0.8935425281524658, "learning_rate": 2e-05, "loss": 0.7209, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2969, "tokens_per_second_per_gpu": 17764.69, "total_tokens": 293354135 }, { "epoch": 0.18567141785446362, "grad_norm": 0.9031855463981628, "learning_rate": 2e-05, "loss": 0.7387, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2970, "tokens_per_second_per_gpu": 17425.29, "total_tokens": 293455873 }, { "epoch": 0.18573393348337083, "grad_norm": 0.9376472234725952, "learning_rate": 2e-05, "loss": 0.7083, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2971, "tokens_per_second_per_gpu": 18271.97, "total_tokens": 293554377 }, { "epoch": 0.18579644911227808, "grad_norm": 0.9088535904884338, "learning_rate": 2e-05, "loss": 0.6859, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2972, "tokens_per_second_per_gpu": 17374.79, "total_tokens": 293652079 }, { "epoch": 0.1858589647411853, "grad_norm": 0.9356797933578491, "learning_rate": 2e-05, "loss": 0.6749, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2973, "tokens_per_second_per_gpu": 17081.17, "total_tokens": 293748898 }, { "epoch": 0.1859214803700925, "grad_norm": 0.9272909164428711, "learning_rate": 2e-05, "loss": 0.704, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2974, "tokens_per_second_per_gpu": 16895.15, "total_tokens": 293846808 }, { "epoch": 0.18598399599899976, "grad_norm": 0.9723256230354309, "learning_rate": 2e-05, "loss": 0.7774, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2975, "tokens_per_second_per_gpu": 17384.14, "total_tokens": 293945190 }, { "epoch": 0.18604651162790697, "grad_norm": 0.9052252173423767, "learning_rate": 2e-05, "loss": 0.7043, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2976, "tokens_per_second_per_gpu": 17915.5, "total_tokens": 294044931 }, { "epoch": 0.1861090272568142, "grad_norm": 0.9276095032691956, "learning_rate": 2e-05, "loss": 0.7545, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2977, "tokens_per_second_per_gpu": 17369.63, "total_tokens": 294144070 }, { "epoch": 0.18617154288572144, "grad_norm": 0.9100381731987, "learning_rate": 2e-05, "loss": 0.7645, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2978, "tokens_per_second_per_gpu": 17975.44, "total_tokens": 294248233 }, { "epoch": 0.18623405851462865, "grad_norm": 0.8993049263954163, "learning_rate": 2e-05, "loss": 0.6974, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2979, "tokens_per_second_per_gpu": 17800.57, "total_tokens": 294347759 }, { "epoch": 0.18629657414353587, "grad_norm": 0.9074593186378479, "learning_rate": 2e-05, "loss": 0.7004, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2980, "tokens_per_second_per_gpu": 17870.74, "total_tokens": 294447172 }, { "epoch": 0.18635908977244312, "grad_norm": 0.8906053304672241, "learning_rate": 2e-05, "loss": 0.6936, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2981, "tokens_per_second_per_gpu": 18004.93, "total_tokens": 294549009 }, { "epoch": 0.18642160540135033, "grad_norm": 0.9000892639160156, "learning_rate": 2e-05, "loss": 0.7238, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2982, "tokens_per_second_per_gpu": 18347.57, "total_tokens": 294652397 }, { "epoch": 0.18648412103025758, "grad_norm": 0.9545291066169739, "learning_rate": 2e-05, "loss": 0.7514, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2983, "tokens_per_second_per_gpu": 17021.18, "total_tokens": 294751288 }, { "epoch": 0.1865466366591648, "grad_norm": 0.9087956547737122, "learning_rate": 2e-05, "loss": 0.7133, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2984, "tokens_per_second_per_gpu": 17511.59, "total_tokens": 294850132 }, { "epoch": 0.186609152288072, "grad_norm": 0.8927509784698486, "learning_rate": 2e-05, "loss": 0.7218, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2985, "tokens_per_second_per_gpu": 17583.61, "total_tokens": 294946761 }, { "epoch": 0.18667166791697926, "grad_norm": 0.9119666814804077, "learning_rate": 2e-05, "loss": 0.7703, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2986, "tokens_per_second_per_gpu": 18394.76, "total_tokens": 295049996 }, { "epoch": 0.18673418354588647, "grad_norm": 0.9343765377998352, "learning_rate": 2e-05, "loss": 0.7003, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2987, "tokens_per_second_per_gpu": 16958.46, "total_tokens": 295145055 }, { "epoch": 0.1867966991747937, "grad_norm": 0.9088069200515747, "learning_rate": 2e-05, "loss": 0.7203, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2988, "tokens_per_second_per_gpu": 16958.42, "total_tokens": 295242861 }, { "epoch": 0.18685921480370093, "grad_norm": 0.865874707698822, "learning_rate": 2e-05, "loss": 0.7482, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2989, "tokens_per_second_per_gpu": 17804.88, "total_tokens": 295344466 }, { "epoch": 0.18692173043260815, "grad_norm": 0.8966962695121765, "learning_rate": 2e-05, "loss": 0.7391, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2990, "tokens_per_second_per_gpu": 17014.09, "total_tokens": 295443282 }, { "epoch": 0.18698424606151537, "grad_norm": 0.9253405928611755, "learning_rate": 2e-05, "loss": 0.7334, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2991, "tokens_per_second_per_gpu": 17738.73, "total_tokens": 295543140 }, { "epoch": 0.1870467616904226, "grad_norm": 0.9273169040679932, "learning_rate": 2e-05, "loss": 0.7098, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2992, "tokens_per_second_per_gpu": 16309.12, "total_tokens": 295634961 }, { "epoch": 0.18710927731932983, "grad_norm": 0.9361913800239563, "learning_rate": 2e-05, "loss": 0.7284, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2993, "tokens_per_second_per_gpu": 16813.69, "total_tokens": 295731107 }, { "epoch": 0.18717179294823705, "grad_norm": 0.8909609913825989, "learning_rate": 2e-05, "loss": 0.7399, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2994, "tokens_per_second_per_gpu": 17493.88, "total_tokens": 295830871 }, { "epoch": 0.1872343085771443, "grad_norm": 0.8865655064582825, "learning_rate": 2e-05, "loss": 0.6688, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2995, "tokens_per_second_per_gpu": 16947.76, "total_tokens": 295926732 }, { "epoch": 0.1872968242060515, "grad_norm": 0.9177238941192627, "learning_rate": 2e-05, "loss": 0.683, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2996, "tokens_per_second_per_gpu": 15958.73, "total_tokens": 296019913 }, { "epoch": 0.18735933983495873, "grad_norm": 0.9074978232383728, "learning_rate": 2e-05, "loss": 0.7172, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2997, "tokens_per_second_per_gpu": 16860.71, "total_tokens": 296116524 }, { "epoch": 0.18742185546386597, "grad_norm": 0.9327871799468994, "learning_rate": 2e-05, "loss": 0.7248, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2998, "tokens_per_second_per_gpu": 17347.46, "total_tokens": 296213223 }, { "epoch": 0.1874843710927732, "grad_norm": 0.920409619808197, "learning_rate": 2e-05, "loss": 0.7312, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 2999, "tokens_per_second_per_gpu": 17311.47, "total_tokens": 296308541 }, { "epoch": 0.18754688672168043, "grad_norm": 0.9188264012336731, "learning_rate": 2e-05, "loss": 0.713, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3000, "tokens_per_second_per_gpu": 17681.88, "total_tokens": 296409035 }, { "epoch": 0.18760940235058765, "grad_norm": 0.9357115030288696, "learning_rate": 2e-05, "loss": 0.6949, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3001, "tokens_per_second_per_gpu": 16909.71, "total_tokens": 296503099 }, { "epoch": 0.18767191797949487, "grad_norm": 0.889485239982605, "learning_rate": 2e-05, "loss": 0.6831, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3002, "tokens_per_second_per_gpu": 16999.75, "total_tokens": 296600027 }, { "epoch": 0.1877344336084021, "grad_norm": 0.911815345287323, "learning_rate": 2e-05, "loss": 0.7077, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3003, "tokens_per_second_per_gpu": 15829.83, "total_tokens": 296694988 }, { "epoch": 0.18779694923730933, "grad_norm": 0.9251092076301575, "learning_rate": 2e-05, "loss": 0.6663, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3004, "tokens_per_second_per_gpu": 15295.28, "total_tokens": 296783925 }, { "epoch": 0.18785946486621655, "grad_norm": 0.8744814991950989, "learning_rate": 2e-05, "loss": 0.6612, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3005, "tokens_per_second_per_gpu": 16916.82, "total_tokens": 296879556 }, { "epoch": 0.1879219804951238, "grad_norm": 0.9262176156044006, "learning_rate": 2e-05, "loss": 0.7096, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3006, "tokens_per_second_per_gpu": 17064.75, "total_tokens": 296975343 }, { "epoch": 0.187984496124031, "grad_norm": 0.9275599122047424, "learning_rate": 2e-05, "loss": 0.7452, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3007, "tokens_per_second_per_gpu": 18648.16, "total_tokens": 297079680 }, { "epoch": 0.18804701175293823, "grad_norm": 0.9520023465156555, "learning_rate": 2e-05, "loss": 0.7245, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3008, "tokens_per_second_per_gpu": 16690.82, "total_tokens": 297172656 }, { "epoch": 0.18810952738184547, "grad_norm": 0.9508721828460693, "learning_rate": 2e-05, "loss": 0.7484, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3009, "tokens_per_second_per_gpu": 17566.56, "total_tokens": 297275013 }, { "epoch": 0.1881720430107527, "grad_norm": 0.9488951563835144, "learning_rate": 2e-05, "loss": 0.7067, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3010, "tokens_per_second_per_gpu": 16718.5, "total_tokens": 297371695 }, { "epoch": 0.1882345586396599, "grad_norm": 0.8942098617553711, "learning_rate": 2e-05, "loss": 0.6653, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3011, "tokens_per_second_per_gpu": 15565.21, "total_tokens": 297464965 }, { "epoch": 0.18829707426856715, "grad_norm": 0.9089059829711914, "learning_rate": 2e-05, "loss": 0.7437, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3012, "tokens_per_second_per_gpu": 17330.69, "total_tokens": 297565595 }, { "epoch": 0.18835958989747437, "grad_norm": 0.8964938521385193, "learning_rate": 2e-05, "loss": 0.6666, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3013, "tokens_per_second_per_gpu": 16040.96, "total_tokens": 297658968 }, { "epoch": 0.18842210552638158, "grad_norm": 0.929500937461853, "learning_rate": 2e-05, "loss": 0.7145, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3014, "tokens_per_second_per_gpu": 16916.8, "total_tokens": 297751772 }, { "epoch": 0.18848462115528883, "grad_norm": 0.9028908014297485, "learning_rate": 2e-05, "loss": 0.7251, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3015, "tokens_per_second_per_gpu": 17238.21, "total_tokens": 297848278 }, { "epoch": 0.18854713678419605, "grad_norm": 0.8542324900627136, "learning_rate": 2e-05, "loss": 0.667, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3016, "tokens_per_second_per_gpu": 16754.11, "total_tokens": 297944619 }, { "epoch": 0.18860965241310326, "grad_norm": 0.921752393245697, "learning_rate": 2e-05, "loss": 0.7257, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3017, "tokens_per_second_per_gpu": 17036.54, "total_tokens": 298040960 }, { "epoch": 0.1886721680420105, "grad_norm": 0.872482419013977, "learning_rate": 2e-05, "loss": 0.6859, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3018, "tokens_per_second_per_gpu": 17119.91, "total_tokens": 298135139 }, { "epoch": 0.18873468367091772, "grad_norm": 0.9143097400665283, "learning_rate": 2e-05, "loss": 0.7104, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3019, "tokens_per_second_per_gpu": 15864.32, "total_tokens": 298230272 }, { "epoch": 0.18879719929982497, "grad_norm": 0.9162232279777527, "learning_rate": 2e-05, "loss": 0.7021, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3020, "tokens_per_second_per_gpu": 16742.23, "total_tokens": 298327841 }, { "epoch": 0.1888597149287322, "grad_norm": 0.9323205351829529, "learning_rate": 2e-05, "loss": 0.6919, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3021, "tokens_per_second_per_gpu": 17059.26, "total_tokens": 298421414 }, { "epoch": 0.1889222305576394, "grad_norm": 0.9001237154006958, "learning_rate": 2e-05, "loss": 0.6958, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3022, "tokens_per_second_per_gpu": 16006.71, "total_tokens": 298519338 }, { "epoch": 0.18898474618654665, "grad_norm": 0.9046635031700134, "learning_rate": 2e-05, "loss": 0.7403, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3023, "tokens_per_second_per_gpu": 18814.58, "total_tokens": 298621476 }, { "epoch": 0.18904726181545387, "grad_norm": 0.9585525393486023, "learning_rate": 2e-05, "loss": 0.75, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3024, "tokens_per_second_per_gpu": 15666.97, "total_tokens": 298714897 }, { "epoch": 0.18910977744436108, "grad_norm": 0.9641141295433044, "learning_rate": 2e-05, "loss": 0.6997, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3025, "tokens_per_second_per_gpu": 15974.22, "total_tokens": 298806174 }, { "epoch": 0.18917229307326833, "grad_norm": 0.9459578990936279, "learning_rate": 2e-05, "loss": 0.7256, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3026, "tokens_per_second_per_gpu": 17330.62, "total_tokens": 298900466 }, { "epoch": 0.18923480870217554, "grad_norm": 0.9803140163421631, "learning_rate": 2e-05, "loss": 0.7521, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3027, "tokens_per_second_per_gpu": 17495.88, "total_tokens": 298998237 }, { "epoch": 0.18929732433108276, "grad_norm": 0.9247860312461853, "learning_rate": 2e-05, "loss": 0.7291, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3028, "tokens_per_second_per_gpu": 16818.82, "total_tokens": 299094985 }, { "epoch": 0.18935983995999, "grad_norm": 0.9039747714996338, "learning_rate": 2e-05, "loss": 0.7172, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3029, "tokens_per_second_per_gpu": 16308.36, "total_tokens": 299191131 }, { "epoch": 0.18942235558889722, "grad_norm": 0.9146566390991211, "learning_rate": 2e-05, "loss": 0.7277, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3030, "tokens_per_second_per_gpu": 18089.59, "total_tokens": 299291709 }, { "epoch": 0.18948487121780444, "grad_norm": 0.9663937091827393, "learning_rate": 2e-05, "loss": 0.7544, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3031, "tokens_per_second_per_gpu": 16868.38, "total_tokens": 299384990 }, { "epoch": 0.18954738684671169, "grad_norm": 0.9200735688209534, "learning_rate": 2e-05, "loss": 0.6559, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3032, "tokens_per_second_per_gpu": 16419.12, "total_tokens": 299477818 }, { "epoch": 0.1896099024756189, "grad_norm": 0.9167922139167786, "learning_rate": 2e-05, "loss": 0.7195, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3033, "tokens_per_second_per_gpu": 16297.09, "total_tokens": 299569902 }, { "epoch": 0.18967241810452612, "grad_norm": 0.9343253970146179, "learning_rate": 2e-05, "loss": 0.7142, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3034, "tokens_per_second_per_gpu": 15878.33, "total_tokens": 299664773 }, { "epoch": 0.18973493373343336, "grad_norm": 0.9352260828018188, "learning_rate": 2e-05, "loss": 0.7074, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3035, "tokens_per_second_per_gpu": 16596.51, "total_tokens": 299759196 }, { "epoch": 0.18979744936234058, "grad_norm": 0.918459951877594, "learning_rate": 2e-05, "loss": 0.6878, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3036, "tokens_per_second_per_gpu": 17333.22, "total_tokens": 299853235 }, { "epoch": 0.1898599649912478, "grad_norm": 0.920188844203949, "learning_rate": 2e-05, "loss": 0.7588, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3037, "tokens_per_second_per_gpu": 17614.94, "total_tokens": 299954342 }, { "epoch": 0.18992248062015504, "grad_norm": 0.9406430125236511, "learning_rate": 2e-05, "loss": 0.7738, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3038, "tokens_per_second_per_gpu": 17833.99, "total_tokens": 300053502 }, { "epoch": 0.18998499624906226, "grad_norm": 0.8712224960327148, "learning_rate": 2e-05, "loss": 0.7152, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3039, "tokens_per_second_per_gpu": 17897.96, "total_tokens": 300155114 }, { "epoch": 0.1900475118779695, "grad_norm": 0.9557879567146301, "learning_rate": 2e-05, "loss": 0.6699, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3040, "tokens_per_second_per_gpu": 16855.71, "total_tokens": 300246377 }, { "epoch": 0.19011002750687672, "grad_norm": 0.9292540550231934, "learning_rate": 2e-05, "loss": 0.754, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3041, "tokens_per_second_per_gpu": 18598.65, "total_tokens": 300345705 }, { "epoch": 0.19017254313578394, "grad_norm": 0.9519219398498535, "learning_rate": 2e-05, "loss": 0.7387, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3042, "tokens_per_second_per_gpu": 16914.14, "total_tokens": 300445125 }, { "epoch": 0.19023505876469118, "grad_norm": 0.8867415189743042, "learning_rate": 2e-05, "loss": 0.7372, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3043, "tokens_per_second_per_gpu": 18080.84, "total_tokens": 300544820 }, { "epoch": 0.1902975743935984, "grad_norm": 0.960088312625885, "learning_rate": 2e-05, "loss": 0.7102, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3044, "tokens_per_second_per_gpu": 16466.46, "total_tokens": 300641677 }, { "epoch": 0.19036009002250562, "grad_norm": 0.9289615750312805, "learning_rate": 2e-05, "loss": 0.7002, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3045, "tokens_per_second_per_gpu": 17329.05, "total_tokens": 300738222 }, { "epoch": 0.19042260565141286, "grad_norm": 0.9273955225944519, "learning_rate": 2e-05, "loss": 0.6969, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3046, "tokens_per_second_per_gpu": 16213.35, "total_tokens": 300830609 }, { "epoch": 0.19048512128032008, "grad_norm": 0.9299726486206055, "learning_rate": 2e-05, "loss": 0.771, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3047, "tokens_per_second_per_gpu": 17496.04, "total_tokens": 300929838 }, { "epoch": 0.1905476369092273, "grad_norm": 0.9892798066139221, "learning_rate": 2e-05, "loss": 0.7625, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3048, "tokens_per_second_per_gpu": 18791.69, "total_tokens": 301032874 }, { "epoch": 0.19061015253813454, "grad_norm": 0.9125273823738098, "learning_rate": 2e-05, "loss": 0.7034, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3049, "tokens_per_second_per_gpu": 17052.67, "total_tokens": 301132002 }, { "epoch": 0.19067266816704176, "grad_norm": 0.95670485496521, "learning_rate": 2e-05, "loss": 0.739, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3050, "tokens_per_second_per_gpu": 16415.04, "total_tokens": 301227497 }, { "epoch": 0.19073518379594898, "grad_norm": 0.9628230929374695, "learning_rate": 2e-05, "loss": 0.6893, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3051, "tokens_per_second_per_gpu": 16580.9, "total_tokens": 301322291 }, { "epoch": 0.19079769942485622, "grad_norm": 0.9372714757919312, "learning_rate": 2e-05, "loss": 0.732, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3052, "tokens_per_second_per_gpu": 16379.87, "total_tokens": 301418820 }, { "epoch": 0.19086021505376344, "grad_norm": 0.9438914656639099, "learning_rate": 2e-05, "loss": 0.7197, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3053, "tokens_per_second_per_gpu": 16976.49, "total_tokens": 301512266 }, { "epoch": 0.19092273068267065, "grad_norm": 0.9201877117156982, "learning_rate": 2e-05, "loss": 0.6943, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3054, "tokens_per_second_per_gpu": 16936.09, "total_tokens": 301609966 }, { "epoch": 0.1909852463115779, "grad_norm": 0.8968002200126648, "learning_rate": 2e-05, "loss": 0.7182, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3055, "tokens_per_second_per_gpu": 17881.35, "total_tokens": 301713675 }, { "epoch": 0.19104776194048512, "grad_norm": 0.9806413054466248, "learning_rate": 2e-05, "loss": 0.7036, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3056, "tokens_per_second_per_gpu": 17272.59, "total_tokens": 301806522 }, { "epoch": 0.19111027756939236, "grad_norm": 0.9195153713226318, "learning_rate": 2e-05, "loss": 0.7654, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3057, "tokens_per_second_per_gpu": 17542.09, "total_tokens": 301909478 }, { "epoch": 0.19117279319829958, "grad_norm": 0.9163614511489868, "learning_rate": 2e-05, "loss": 0.745, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3058, "tokens_per_second_per_gpu": 17359.01, "total_tokens": 302007237 }, { "epoch": 0.1912353088272068, "grad_norm": 0.9288994073867798, "learning_rate": 2e-05, "loss": 0.7098, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3059, "tokens_per_second_per_gpu": 16506.79, "total_tokens": 302104616 }, { "epoch": 0.19129782445611404, "grad_norm": 0.9159637093544006, "learning_rate": 2e-05, "loss": 0.7315, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3060, "tokens_per_second_per_gpu": 18357.75, "total_tokens": 302203580 }, { "epoch": 0.19136034008502126, "grad_norm": 0.9154634475708008, "learning_rate": 2e-05, "loss": 0.665, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3061, "tokens_per_second_per_gpu": 14844.68, "total_tokens": 302294324 }, { "epoch": 0.19142285571392847, "grad_norm": 0.939102828502655, "learning_rate": 2e-05, "loss": 0.7087, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3062, "tokens_per_second_per_gpu": 17387.61, "total_tokens": 302390277 }, { "epoch": 0.19148537134283572, "grad_norm": 0.941900908946991, "learning_rate": 2e-05, "loss": 0.7533, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3063, "tokens_per_second_per_gpu": 16956.62, "total_tokens": 302486666 }, { "epoch": 0.19154788697174294, "grad_norm": 0.9232956767082214, "learning_rate": 2e-05, "loss": 0.7734, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3064, "tokens_per_second_per_gpu": 18411.5, "total_tokens": 302588740 }, { "epoch": 0.19161040260065015, "grad_norm": 0.9347029328346252, "learning_rate": 2e-05, "loss": 0.7241, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3065, "tokens_per_second_per_gpu": 16397.15, "total_tokens": 302683000 }, { "epoch": 0.1916729182295574, "grad_norm": 0.9403701424598694, "learning_rate": 2e-05, "loss": 0.6729, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3066, "tokens_per_second_per_gpu": 17576.39, "total_tokens": 302778990 }, { "epoch": 0.19173543385846462, "grad_norm": 0.9449431896209717, "learning_rate": 2e-05, "loss": 0.7278, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3067, "tokens_per_second_per_gpu": 18797.15, "total_tokens": 302878593 }, { "epoch": 0.19179794948737183, "grad_norm": 0.9114310145378113, "learning_rate": 2e-05, "loss": 0.7123, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3068, "tokens_per_second_per_gpu": 18178.26, "total_tokens": 302980281 }, { "epoch": 0.19186046511627908, "grad_norm": 0.9716349840164185, "learning_rate": 2e-05, "loss": 0.6538, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3069, "tokens_per_second_per_gpu": 15140.3, "total_tokens": 303069774 }, { "epoch": 0.1919229807451863, "grad_norm": 0.886228621006012, "learning_rate": 2e-05, "loss": 0.6961, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3070, "tokens_per_second_per_gpu": 18072.17, "total_tokens": 303173440 }, { "epoch": 0.1919854963740935, "grad_norm": 1.04654860496521, "learning_rate": 2e-05, "loss": 0.7444, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3071, "tokens_per_second_per_gpu": 16076.3, "total_tokens": 303269321 }, { "epoch": 0.19204801200300076, "grad_norm": 0.9297623634338379, "learning_rate": 2e-05, "loss": 0.6984, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3072, "tokens_per_second_per_gpu": 18751.91, "total_tokens": 303370111 }, { "epoch": 0.19211052763190797, "grad_norm": 0.9094322919845581, "learning_rate": 2e-05, "loss": 0.6651, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3073, "tokens_per_second_per_gpu": 16781.46, "total_tokens": 303466957 }, { "epoch": 0.1921730432608152, "grad_norm": 0.9529551863670349, "learning_rate": 2e-05, "loss": 0.7614, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3074, "tokens_per_second_per_gpu": 17109.29, "total_tokens": 303561132 }, { "epoch": 0.19223555888972244, "grad_norm": 0.9057062864303589, "learning_rate": 2e-05, "loss": 0.7166, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3075, "tokens_per_second_per_gpu": 17065.15, "total_tokens": 303660793 }, { "epoch": 0.19229807451862965, "grad_norm": 0.9582470655441284, "learning_rate": 2e-05, "loss": 0.7225, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3076, "tokens_per_second_per_gpu": 16961.12, "total_tokens": 303755478 }, { "epoch": 0.1923605901475369, "grad_norm": 0.9094344973564148, "learning_rate": 2e-05, "loss": 0.7787, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3077, "tokens_per_second_per_gpu": 18268.69, "total_tokens": 303857740 }, { "epoch": 0.19242310577644411, "grad_norm": 0.9233578443527222, "learning_rate": 2e-05, "loss": 0.7436, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3078, "tokens_per_second_per_gpu": 17252.4, "total_tokens": 303958008 }, { "epoch": 0.19248562140535133, "grad_norm": 0.9154567122459412, "learning_rate": 2e-05, "loss": 0.7571, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3079, "tokens_per_second_per_gpu": 16811.29, "total_tokens": 304054529 }, { "epoch": 0.19254813703425858, "grad_norm": 0.9665129780769348, "learning_rate": 2e-05, "loss": 0.703, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3080, "tokens_per_second_per_gpu": 16875.77, "total_tokens": 304150713 }, { "epoch": 0.1926106526631658, "grad_norm": 0.9614959359169006, "learning_rate": 2e-05, "loss": 0.7343, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3081, "tokens_per_second_per_gpu": 17692.05, "total_tokens": 304250771 }, { "epoch": 0.192673168292073, "grad_norm": 0.9366120100021362, "learning_rate": 2e-05, "loss": 0.7078, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3082, "tokens_per_second_per_gpu": 15793.0, "total_tokens": 304342729 }, { "epoch": 0.19273568392098026, "grad_norm": 0.9119196534156799, "learning_rate": 2e-05, "loss": 0.7067, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3083, "tokens_per_second_per_gpu": 17390.85, "total_tokens": 304440295 }, { "epoch": 0.19279819954988747, "grad_norm": 1.0062309503555298, "learning_rate": 2e-05, "loss": 0.7067, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3084, "tokens_per_second_per_gpu": 17651.6, "total_tokens": 304537465 }, { "epoch": 0.1928607151787947, "grad_norm": 0.9580155611038208, "learning_rate": 2e-05, "loss": 0.7282, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3085, "tokens_per_second_per_gpu": 17744.68, "total_tokens": 304636379 }, { "epoch": 0.19292323080770193, "grad_norm": 0.9300965070724487, "learning_rate": 2e-05, "loss": 0.7337, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3086, "tokens_per_second_per_gpu": 17971.1, "total_tokens": 304736459 }, { "epoch": 0.19298574643660915, "grad_norm": 0.9096022248268127, "learning_rate": 2e-05, "loss": 0.7247, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3087, "tokens_per_second_per_gpu": 17083.97, "total_tokens": 304832302 }, { "epoch": 0.19304826206551637, "grad_norm": 0.8913066387176514, "learning_rate": 2e-05, "loss": 0.6931, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3088, "tokens_per_second_per_gpu": 17405.44, "total_tokens": 304927746 }, { "epoch": 0.1931107776944236, "grad_norm": 0.9554876089096069, "learning_rate": 2e-05, "loss": 0.7179, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3089, "tokens_per_second_per_gpu": 17127.29, "total_tokens": 305025039 }, { "epoch": 0.19317329332333083, "grad_norm": 0.91298508644104, "learning_rate": 2e-05, "loss": 0.749, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3090, "tokens_per_second_per_gpu": 17520.97, "total_tokens": 305126123 }, { "epoch": 0.19323580895223805, "grad_norm": 0.8946552872657776, "learning_rate": 2e-05, "loss": 0.695, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3091, "tokens_per_second_per_gpu": 17069.27, "total_tokens": 305225096 }, { "epoch": 0.1932983245811453, "grad_norm": 1.0101306438446045, "learning_rate": 2e-05, "loss": 0.7399, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3092, "tokens_per_second_per_gpu": 17104.88, "total_tokens": 305325908 }, { "epoch": 0.1933608402100525, "grad_norm": 0.94699627161026, "learning_rate": 2e-05, "loss": 0.7521, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3093, "tokens_per_second_per_gpu": 18140.37, "total_tokens": 305426522 }, { "epoch": 0.19342335583895973, "grad_norm": 0.8932915329933167, "learning_rate": 2e-05, "loss": 0.6952, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3094, "tokens_per_second_per_gpu": 16686.11, "total_tokens": 305523495 }, { "epoch": 0.19348587146786697, "grad_norm": 0.9148370027542114, "learning_rate": 2e-05, "loss": 0.7155, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3095, "tokens_per_second_per_gpu": 16510.16, "total_tokens": 305621567 }, { "epoch": 0.1935483870967742, "grad_norm": 0.9016634821891785, "learning_rate": 2e-05, "loss": 0.6715, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3096, "tokens_per_second_per_gpu": 16383.28, "total_tokens": 305718542 }, { "epoch": 0.19361090272568143, "grad_norm": 0.9913292527198792, "learning_rate": 2e-05, "loss": 0.7367, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3097, "tokens_per_second_per_gpu": 15428.72, "total_tokens": 305812479 }, { "epoch": 0.19367341835458865, "grad_norm": 0.93324875831604, "learning_rate": 2e-05, "loss": 0.7306, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3098, "tokens_per_second_per_gpu": 16625.37, "total_tokens": 305906638 }, { "epoch": 0.19373593398349587, "grad_norm": 0.9220231175422668, "learning_rate": 2e-05, "loss": 0.7221, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3099, "tokens_per_second_per_gpu": 17872.22, "total_tokens": 306006718 }, { "epoch": 0.1937984496124031, "grad_norm": 0.9218279123306274, "learning_rate": 2e-05, "loss": 0.7209, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3100, "tokens_per_second_per_gpu": 16129.22, "total_tokens": 306104814 }, { "epoch": 0.19386096524131033, "grad_norm": 0.9064011573791504, "learning_rate": 2e-05, "loss": 0.7115, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3101, "tokens_per_second_per_gpu": 17397.05, "total_tokens": 306204427 }, { "epoch": 0.19392348087021755, "grad_norm": 1.0036883354187012, "learning_rate": 2e-05, "loss": 0.7192, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3102, "tokens_per_second_per_gpu": 15728.07, "total_tokens": 306293349 }, { "epoch": 0.1939859964991248, "grad_norm": 0.9431085586547852, "learning_rate": 2e-05, "loss": 0.7981, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3103, "tokens_per_second_per_gpu": 17185.71, "total_tokens": 306391438 }, { "epoch": 0.194048512128032, "grad_norm": 0.9099276661872864, "learning_rate": 2e-05, "loss": 0.7218, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3104, "tokens_per_second_per_gpu": 17385.16, "total_tokens": 306490084 }, { "epoch": 0.19411102775693923, "grad_norm": 0.9299408197402954, "learning_rate": 2e-05, "loss": 0.7196, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3105, "tokens_per_second_per_gpu": 17107.48, "total_tokens": 306591019 }, { "epoch": 0.19417354338584647, "grad_norm": 0.9038006663322449, "learning_rate": 2e-05, "loss": 0.6683, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3106, "tokens_per_second_per_gpu": 15838.41, "total_tokens": 306681712 }, { "epoch": 0.1942360590147537, "grad_norm": 0.9702273607254028, "learning_rate": 2e-05, "loss": 0.6911, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3107, "tokens_per_second_per_gpu": 16811.55, "total_tokens": 306775576 }, { "epoch": 0.1942985746436609, "grad_norm": 0.9092380404472351, "learning_rate": 2e-05, "loss": 0.6987, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3108, "tokens_per_second_per_gpu": 16414.52, "total_tokens": 306871676 }, { "epoch": 0.19436109027256815, "grad_norm": 0.9483994245529175, "learning_rate": 2e-05, "loss": 0.6915, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3109, "tokens_per_second_per_gpu": 17037.4, "total_tokens": 306969503 }, { "epoch": 0.19442360590147537, "grad_norm": 0.9027639627456665, "learning_rate": 2e-05, "loss": 0.6503, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3110, "tokens_per_second_per_gpu": 15962.71, "total_tokens": 307060105 }, { "epoch": 0.19448612153038258, "grad_norm": 0.8984349370002747, "learning_rate": 2e-05, "loss": 0.7276, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3111, "tokens_per_second_per_gpu": 17914.5, "total_tokens": 307162590 }, { "epoch": 0.19454863715928983, "grad_norm": 0.9740186333656311, "learning_rate": 2e-05, "loss": 0.7354, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3112, "tokens_per_second_per_gpu": 16890.72, "total_tokens": 307262079 }, { "epoch": 0.19461115278819704, "grad_norm": 0.9901677370071411, "learning_rate": 2e-05, "loss": 0.7691, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3113, "tokens_per_second_per_gpu": 17080.93, "total_tokens": 307360154 }, { "epoch": 0.1946736684171043, "grad_norm": 0.9490518569946289, "learning_rate": 2e-05, "loss": 0.6992, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3114, "tokens_per_second_per_gpu": 16406.65, "total_tokens": 307454874 }, { "epoch": 0.1947361840460115, "grad_norm": 0.9826334118843079, "learning_rate": 2e-05, "loss": 0.7117, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3115, "tokens_per_second_per_gpu": 15646.84, "total_tokens": 307546745 }, { "epoch": 0.19479869967491872, "grad_norm": 0.9240089058876038, "learning_rate": 2e-05, "loss": 0.6997, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3116, "tokens_per_second_per_gpu": 17794.56, "total_tokens": 307645672 }, { "epoch": 0.19486121530382597, "grad_norm": 0.9012929797172546, "learning_rate": 2e-05, "loss": 0.7516, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3117, "tokens_per_second_per_gpu": 17584.62, "total_tokens": 307743593 }, { "epoch": 0.19492373093273319, "grad_norm": 0.9689899682998657, "learning_rate": 2e-05, "loss": 0.7062, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3118, "tokens_per_second_per_gpu": 16311.61, "total_tokens": 307839919 }, { "epoch": 0.1949862465616404, "grad_norm": 0.863365113735199, "learning_rate": 2e-05, "loss": 0.6731, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3119, "tokens_per_second_per_gpu": 17820.83, "total_tokens": 307941121 }, { "epoch": 0.19504876219054765, "grad_norm": 0.8887298107147217, "learning_rate": 2e-05, "loss": 0.6706, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3120, "tokens_per_second_per_gpu": 17570.88, "total_tokens": 308041404 }, { "epoch": 0.19511127781945486, "grad_norm": 0.8939189314842224, "learning_rate": 2e-05, "loss": 0.6626, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3121, "tokens_per_second_per_gpu": 15934.18, "total_tokens": 308136955 }, { "epoch": 0.19517379344836208, "grad_norm": 0.8907235860824585, "learning_rate": 2e-05, "loss": 0.6626, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3122, "tokens_per_second_per_gpu": 17652.52, "total_tokens": 308234834 }, { "epoch": 0.19523630907726933, "grad_norm": 0.8740129470825195, "learning_rate": 2e-05, "loss": 0.7068, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3123, "tokens_per_second_per_gpu": 17870.28, "total_tokens": 308335858 }, { "epoch": 0.19529882470617654, "grad_norm": 0.8988009095191956, "learning_rate": 2e-05, "loss": 0.7144, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3124, "tokens_per_second_per_gpu": 18324.81, "total_tokens": 308438610 }, { "epoch": 0.19536134033508376, "grad_norm": 0.9421533346176147, "learning_rate": 2e-05, "loss": 0.6846, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3125, "tokens_per_second_per_gpu": 18149.49, "total_tokens": 308541602 }, { "epoch": 0.195423855963991, "grad_norm": 0.9071244597434998, "learning_rate": 2e-05, "loss": 0.7386, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3126, "tokens_per_second_per_gpu": 17226.36, "total_tokens": 308637949 }, { "epoch": 0.19548637159289822, "grad_norm": 0.8739496469497681, "learning_rate": 2e-05, "loss": 0.7196, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3127, "tokens_per_second_per_gpu": 17660.1, "total_tokens": 308736424 }, { "epoch": 0.19554888722180544, "grad_norm": 0.8966848254203796, "learning_rate": 2e-05, "loss": 0.7034, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3128, "tokens_per_second_per_gpu": 17345.41, "total_tokens": 308833460 }, { "epoch": 0.19561140285071268, "grad_norm": 0.9455466866493225, "learning_rate": 2e-05, "loss": 0.7085, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3129, "tokens_per_second_per_gpu": 16974.32, "total_tokens": 308932209 }, { "epoch": 0.1956739184796199, "grad_norm": 0.9204097390174866, "learning_rate": 2e-05, "loss": 0.7006, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3130, "tokens_per_second_per_gpu": 17432.15, "total_tokens": 309033266 }, { "epoch": 0.19573643410852712, "grad_norm": 0.9338356852531433, "learning_rate": 2e-05, "loss": 0.7344, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3131, "tokens_per_second_per_gpu": 17372.84, "total_tokens": 309135446 }, { "epoch": 0.19579894973743436, "grad_norm": 0.8983652591705322, "learning_rate": 2e-05, "loss": 0.692, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3132, "tokens_per_second_per_gpu": 17972.96, "total_tokens": 309236902 }, { "epoch": 0.19586146536634158, "grad_norm": 0.8998593091964722, "learning_rate": 2e-05, "loss": 0.719, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3133, "tokens_per_second_per_gpu": 16588.07, "total_tokens": 309335261 }, { "epoch": 0.19592398099524883, "grad_norm": 0.9156457185745239, "learning_rate": 2e-05, "loss": 0.6643, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3134, "tokens_per_second_per_gpu": 17087.1, "total_tokens": 309431447 }, { "epoch": 0.19598649662415604, "grad_norm": 0.9152355790138245, "learning_rate": 2e-05, "loss": 0.7668, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3135, "tokens_per_second_per_gpu": 17993.86, "total_tokens": 309533881 }, { "epoch": 0.19604901225306326, "grad_norm": 0.8873387575149536, "learning_rate": 2e-05, "loss": 0.7213, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3136, "tokens_per_second_per_gpu": 18274.67, "total_tokens": 309636676 }, { "epoch": 0.1961115278819705, "grad_norm": 0.9004818797111511, "learning_rate": 2e-05, "loss": 0.703, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3137, "tokens_per_second_per_gpu": 17963.31, "total_tokens": 309738444 }, { "epoch": 0.19617404351087772, "grad_norm": 0.8933883905410767, "learning_rate": 2e-05, "loss": 0.7527, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3138, "tokens_per_second_per_gpu": 17921.0, "total_tokens": 309840137 }, { "epoch": 0.19623655913978494, "grad_norm": 0.9399624466896057, "learning_rate": 2e-05, "loss": 0.7419, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3139, "tokens_per_second_per_gpu": 16792.85, "total_tokens": 309936136 }, { "epoch": 0.19629907476869218, "grad_norm": 0.9413822889328003, "learning_rate": 2e-05, "loss": 0.7077, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3140, "tokens_per_second_per_gpu": 17238.78, "total_tokens": 310038058 }, { "epoch": 0.1963615903975994, "grad_norm": 0.9021515250205994, "learning_rate": 2e-05, "loss": 0.7296, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3141, "tokens_per_second_per_gpu": 18227.71, "total_tokens": 310139036 }, { "epoch": 0.19642410602650662, "grad_norm": 0.8913530707359314, "learning_rate": 2e-05, "loss": 0.7009, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3142, "tokens_per_second_per_gpu": 18589.43, "total_tokens": 310242153 }, { "epoch": 0.19648662165541386, "grad_norm": 0.9006868004798889, "learning_rate": 2e-05, "loss": 0.7034, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3143, "tokens_per_second_per_gpu": 17046.54, "total_tokens": 310340060 }, { "epoch": 0.19654913728432108, "grad_norm": 0.9066780209541321, "learning_rate": 2e-05, "loss": 0.7212, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3144, "tokens_per_second_per_gpu": 18016.91, "total_tokens": 310437832 }, { "epoch": 0.1966116529132283, "grad_norm": 0.8909957408905029, "learning_rate": 2e-05, "loss": 0.6908, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3145, "tokens_per_second_per_gpu": 17846.37, "total_tokens": 310537660 }, { "epoch": 0.19667416854213554, "grad_norm": 0.8721911907196045, "learning_rate": 2e-05, "loss": 0.7253, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3146, "tokens_per_second_per_gpu": 17890.1, "total_tokens": 310640097 }, { "epoch": 0.19673668417104276, "grad_norm": 0.8834393620491028, "learning_rate": 2e-05, "loss": 0.7097, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3147, "tokens_per_second_per_gpu": 18572.83, "total_tokens": 310741030 }, { "epoch": 0.19679919979994998, "grad_norm": 0.898649275302887, "learning_rate": 2e-05, "loss": 0.7067, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3148, "tokens_per_second_per_gpu": 17916.18, "total_tokens": 310843530 }, { "epoch": 0.19686171542885722, "grad_norm": 0.8553916215896606, "learning_rate": 2e-05, "loss": 0.7302, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3149, "tokens_per_second_per_gpu": 18901.32, "total_tokens": 310949373 }, { "epoch": 0.19692423105776444, "grad_norm": 0.8726934790611267, "learning_rate": 2e-05, "loss": 0.7153, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3150, "tokens_per_second_per_gpu": 17856.44, "total_tokens": 311056500 }, { "epoch": 0.19698674668667165, "grad_norm": 0.8565430045127869, "learning_rate": 2e-05, "loss": 0.6788, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3151, "tokens_per_second_per_gpu": 17721.38, "total_tokens": 311158302 }, { "epoch": 0.1970492623155789, "grad_norm": 0.8969045281410217, "learning_rate": 2e-05, "loss": 0.6842, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3152, "tokens_per_second_per_gpu": 16321.57, "total_tokens": 311252156 }, { "epoch": 0.19711177794448612, "grad_norm": 0.9241505265235901, "learning_rate": 2e-05, "loss": 0.7108, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3153, "tokens_per_second_per_gpu": 17841.21, "total_tokens": 311352294 }, { "epoch": 0.19717429357339336, "grad_norm": 0.9661359190940857, "learning_rate": 2e-05, "loss": 0.7256, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3154, "tokens_per_second_per_gpu": 16855.68, "total_tokens": 311448801 }, { "epoch": 0.19723680920230058, "grad_norm": 0.9038260579109192, "learning_rate": 2e-05, "loss": 0.748, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3155, "tokens_per_second_per_gpu": 17229.43, "total_tokens": 311550554 }, { "epoch": 0.1972993248312078, "grad_norm": 0.9127503633499146, "learning_rate": 2e-05, "loss": 0.7174, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3156, "tokens_per_second_per_gpu": 17303.77, "total_tokens": 311646721 }, { "epoch": 0.19736184046011504, "grad_norm": 0.909277081489563, "learning_rate": 2e-05, "loss": 0.6853, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3157, "tokens_per_second_per_gpu": 16958.6, "total_tokens": 311742074 }, { "epoch": 0.19742435608902226, "grad_norm": 0.9312441349029541, "learning_rate": 2e-05, "loss": 0.6847, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3158, "tokens_per_second_per_gpu": 17670.83, "total_tokens": 311837063 }, { "epoch": 0.19748687171792947, "grad_norm": 0.8994888663291931, "learning_rate": 2e-05, "loss": 0.7325, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3159, "tokens_per_second_per_gpu": 17345.88, "total_tokens": 311938513 }, { "epoch": 0.19754938734683672, "grad_norm": 0.9742658138275146, "learning_rate": 2e-05, "loss": 0.6978, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3160, "tokens_per_second_per_gpu": 16656.46, "total_tokens": 312038562 }, { "epoch": 0.19761190297574394, "grad_norm": 0.8656207919120789, "learning_rate": 2e-05, "loss": 0.6671, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3161, "tokens_per_second_per_gpu": 17528.4, "total_tokens": 312137052 }, { "epoch": 0.19767441860465115, "grad_norm": 1.0035030841827393, "learning_rate": 2e-05, "loss": 0.6984, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3162, "tokens_per_second_per_gpu": 16786.26, "total_tokens": 312232371 }, { "epoch": 0.1977369342335584, "grad_norm": 0.9075307846069336, "learning_rate": 2e-05, "loss": 0.7146, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3163, "tokens_per_second_per_gpu": 17241.52, "total_tokens": 312329838 }, { "epoch": 0.19779944986246562, "grad_norm": 0.8823544979095459, "learning_rate": 2e-05, "loss": 0.6638, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3164, "tokens_per_second_per_gpu": 17100.76, "total_tokens": 312427787 }, { "epoch": 0.19786196549137283, "grad_norm": 0.9865087866783142, "learning_rate": 2e-05, "loss": 0.6867, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3165, "tokens_per_second_per_gpu": 16701.23, "total_tokens": 312522993 }, { "epoch": 0.19792448112028008, "grad_norm": 0.9553170204162598, "learning_rate": 2e-05, "loss": 0.7145, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3166, "tokens_per_second_per_gpu": 18222.46, "total_tokens": 312625751 }, { "epoch": 0.1979869967491873, "grad_norm": 0.9101760387420654, "learning_rate": 2e-05, "loss": 0.7242, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3167, "tokens_per_second_per_gpu": 17494.62, "total_tokens": 312723910 }, { "epoch": 0.1980495123780945, "grad_norm": 0.9019327163696289, "learning_rate": 2e-05, "loss": 0.6882, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3168, "tokens_per_second_per_gpu": 17378.14, "total_tokens": 312823073 }, { "epoch": 0.19811202800700176, "grad_norm": 0.9785290360450745, "learning_rate": 2e-05, "loss": 0.7281, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3169, "tokens_per_second_per_gpu": 18588.8, "total_tokens": 312925752 }, { "epoch": 0.19817454363590897, "grad_norm": 0.8634876608848572, "learning_rate": 2e-05, "loss": 0.7156, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3170, "tokens_per_second_per_gpu": 17933.09, "total_tokens": 313027106 }, { "epoch": 0.19823705926481622, "grad_norm": 0.9243949055671692, "learning_rate": 2e-05, "loss": 0.711, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3171, "tokens_per_second_per_gpu": 17244.74, "total_tokens": 313125307 }, { "epoch": 0.19829957489372343, "grad_norm": 0.8933592438697815, "learning_rate": 2e-05, "loss": 0.7132, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3172, "tokens_per_second_per_gpu": 17870.52, "total_tokens": 313220879 }, { "epoch": 0.19836209052263065, "grad_norm": 0.9147912859916687, "learning_rate": 2e-05, "loss": 0.7227, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3173, "tokens_per_second_per_gpu": 16271.11, "total_tokens": 313318136 }, { "epoch": 0.1984246061515379, "grad_norm": 0.9376128911972046, "learning_rate": 2e-05, "loss": 0.7079, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3174, "tokens_per_second_per_gpu": 17773.04, "total_tokens": 313417328 }, { "epoch": 0.1984871217804451, "grad_norm": 0.863564133644104, "learning_rate": 2e-05, "loss": 0.6717, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3175, "tokens_per_second_per_gpu": 16900.51, "total_tokens": 313514299 }, { "epoch": 0.19854963740935233, "grad_norm": 0.8847078084945679, "learning_rate": 2e-05, "loss": 0.6718, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3176, "tokens_per_second_per_gpu": 17269.71, "total_tokens": 313614852 }, { "epoch": 0.19861215303825958, "grad_norm": 0.8795254826545715, "learning_rate": 2e-05, "loss": 0.733, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3177, "tokens_per_second_per_gpu": 17793.1, "total_tokens": 313716812 }, { "epoch": 0.1986746686671668, "grad_norm": 0.8716992735862732, "learning_rate": 2e-05, "loss": 0.698, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3178, "tokens_per_second_per_gpu": 16797.05, "total_tokens": 313811888 }, { "epoch": 0.198737184296074, "grad_norm": 0.9374571442604065, "learning_rate": 2e-05, "loss": 0.7175, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3179, "tokens_per_second_per_gpu": 17537.75, "total_tokens": 313912500 }, { "epoch": 0.19879969992498125, "grad_norm": 0.9529299139976501, "learning_rate": 2e-05, "loss": 0.715, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3180, "tokens_per_second_per_gpu": 16392.71, "total_tokens": 314011561 }, { "epoch": 0.19886221555388847, "grad_norm": 0.9325137138366699, "learning_rate": 2e-05, "loss": 0.7072, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3181, "tokens_per_second_per_gpu": 16338.99, "total_tokens": 314109518 }, { "epoch": 0.1989247311827957, "grad_norm": 0.9964168667793274, "learning_rate": 2e-05, "loss": 0.728, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3182, "tokens_per_second_per_gpu": 17413.56, "total_tokens": 314208502 }, { "epoch": 0.19898724681170293, "grad_norm": 0.8899219036102295, "learning_rate": 2e-05, "loss": 0.6938, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3183, "tokens_per_second_per_gpu": 17237.54, "total_tokens": 314310225 }, { "epoch": 0.19904976244061015, "grad_norm": 0.9052229523658752, "learning_rate": 2e-05, "loss": 0.7158, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3184, "tokens_per_second_per_gpu": 17075.11, "total_tokens": 314409185 }, { "epoch": 0.19911227806951737, "grad_norm": 0.8998997211456299, "learning_rate": 2e-05, "loss": 0.7233, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3185, "tokens_per_second_per_gpu": 17510.3, "total_tokens": 314509499 }, { "epoch": 0.1991747936984246, "grad_norm": 0.9234640598297119, "learning_rate": 2e-05, "loss": 0.7518, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3186, "tokens_per_second_per_gpu": 17475.48, "total_tokens": 314609454 }, { "epoch": 0.19923730932733183, "grad_norm": 0.9315038323402405, "learning_rate": 2e-05, "loss": 0.7242, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3187, "tokens_per_second_per_gpu": 15594.91, "total_tokens": 314703634 }, { "epoch": 0.19929982495623905, "grad_norm": 0.939637303352356, "learning_rate": 2e-05, "loss": 0.677, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3188, "tokens_per_second_per_gpu": 16919.66, "total_tokens": 314797104 }, { "epoch": 0.1993623405851463, "grad_norm": 0.9075549244880676, "learning_rate": 2e-05, "loss": 0.7028, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3189, "tokens_per_second_per_gpu": 16471.77, "total_tokens": 314894527 }, { "epoch": 0.1994248562140535, "grad_norm": 0.9504323601722717, "learning_rate": 2e-05, "loss": 0.6808, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3190, "tokens_per_second_per_gpu": 17869.86, "total_tokens": 314993117 }, { "epoch": 0.19948737184296075, "grad_norm": 0.9290003180503845, "learning_rate": 2e-05, "loss": 0.7082, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3191, "tokens_per_second_per_gpu": 17459.9, "total_tokens": 315093774 }, { "epoch": 0.19954988747186797, "grad_norm": 0.9505457878112793, "learning_rate": 2e-05, "loss": 0.7378, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3192, "tokens_per_second_per_gpu": 17133.05, "total_tokens": 315192456 }, { "epoch": 0.1996124031007752, "grad_norm": 0.9189764857292175, "learning_rate": 2e-05, "loss": 0.7079, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3193, "tokens_per_second_per_gpu": 16622.84, "total_tokens": 315291230 }, { "epoch": 0.19967491872968243, "grad_norm": 0.9816626906394958, "learning_rate": 2e-05, "loss": 0.7046, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3194, "tokens_per_second_per_gpu": 15589.67, "total_tokens": 315388599 }, { "epoch": 0.19973743435858965, "grad_norm": 0.9191132187843323, "learning_rate": 2e-05, "loss": 0.7158, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3195, "tokens_per_second_per_gpu": 19067.13, "total_tokens": 315490297 }, { "epoch": 0.19979994998749687, "grad_norm": 0.899457573890686, "learning_rate": 2e-05, "loss": 0.7003, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3196, "tokens_per_second_per_gpu": 17544.89, "total_tokens": 315591553 }, { "epoch": 0.1998624656164041, "grad_norm": 0.986318826675415, "learning_rate": 2e-05, "loss": 0.7615, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3197, "tokens_per_second_per_gpu": 17506.71, "total_tokens": 315693149 }, { "epoch": 0.19992498124531133, "grad_norm": 0.9451489448547363, "learning_rate": 2e-05, "loss": 0.7447, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3198, "tokens_per_second_per_gpu": 18039.22, "total_tokens": 315795619 }, { "epoch": 0.19998749687421855, "grad_norm": 0.8826286792755127, "learning_rate": 2e-05, "loss": 0.6823, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3199, "tokens_per_second_per_gpu": 17567.02, "total_tokens": 315897274 }, { "epoch": 0.2000500125031258, "grad_norm": 0.8988046050071716, "learning_rate": 2e-05, "loss": 0.7175, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3200, "tokens_per_second_per_gpu": 16848.68, "total_tokens": 315992544 }, { "epoch": 0.200112528132033, "grad_norm": 0.9196089506149292, "learning_rate": 2e-05, "loss": 0.7597, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3201, "tokens_per_second_per_gpu": 18087.44, "total_tokens": 316093683 }, { "epoch": 0.20017504376094022, "grad_norm": 0.8704236745834351, "learning_rate": 2e-05, "loss": 0.7152, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3202, "tokens_per_second_per_gpu": 17438.03, "total_tokens": 316195227 }, { "epoch": 0.20023755938984747, "grad_norm": 0.8712000846862793, "learning_rate": 2e-05, "loss": 0.6891, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3203, "tokens_per_second_per_gpu": 17847.45, "total_tokens": 316297828 }, { "epoch": 0.2003000750187547, "grad_norm": 0.9373867511749268, "learning_rate": 2e-05, "loss": 0.6998, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3204, "tokens_per_second_per_gpu": 17521.83, "total_tokens": 316396163 }, { "epoch": 0.2003625906476619, "grad_norm": 0.9113324880599976, "learning_rate": 2e-05, "loss": 0.7318, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3205, "tokens_per_second_per_gpu": 17688.38, "total_tokens": 316493885 }, { "epoch": 0.20042510627656915, "grad_norm": 0.9656354188919067, "learning_rate": 2e-05, "loss": 0.7505, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3206, "tokens_per_second_per_gpu": 16837.77, "total_tokens": 316595940 }, { "epoch": 0.20048762190547637, "grad_norm": 0.9934861660003662, "learning_rate": 2e-05, "loss": 0.685, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3207, "tokens_per_second_per_gpu": 18240.85, "total_tokens": 316697141 }, { "epoch": 0.20055013753438358, "grad_norm": 0.937875509262085, "learning_rate": 2e-05, "loss": 0.6919, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3208, "tokens_per_second_per_gpu": 16937.75, "total_tokens": 316793260 }, { "epoch": 0.20061265316329083, "grad_norm": 0.9043986797332764, "learning_rate": 2e-05, "loss": 0.6864, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3209, "tokens_per_second_per_gpu": 17000.04, "total_tokens": 316891201 }, { "epoch": 0.20067516879219804, "grad_norm": 0.9664772152900696, "learning_rate": 2e-05, "loss": 0.7244, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3210, "tokens_per_second_per_gpu": 16734.15, "total_tokens": 316991160 }, { "epoch": 0.2007376844211053, "grad_norm": 1.0226291418075562, "learning_rate": 2e-05, "loss": 0.7242, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3211, "tokens_per_second_per_gpu": 17669.28, "total_tokens": 317091674 }, { "epoch": 0.2008002000500125, "grad_norm": 0.9482648968696594, "learning_rate": 2e-05, "loss": 0.7152, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3212, "tokens_per_second_per_gpu": 17471.31, "total_tokens": 317189504 }, { "epoch": 0.20086271567891972, "grad_norm": 0.9081990718841553, "learning_rate": 2e-05, "loss": 0.7024, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3213, "tokens_per_second_per_gpu": 16270.75, "total_tokens": 317288299 }, { "epoch": 0.20092523130782697, "grad_norm": 1.069034218788147, "learning_rate": 2e-05, "loss": 0.7152, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3214, "tokens_per_second_per_gpu": 16709.61, "total_tokens": 317387033 }, { "epoch": 0.20098774693673419, "grad_norm": 0.9284775257110596, "learning_rate": 2e-05, "loss": 0.6935, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3215, "tokens_per_second_per_gpu": 17729.23, "total_tokens": 317485073 }, { "epoch": 0.2010502625656414, "grad_norm": 0.9152222275733948, "learning_rate": 2e-05, "loss": 0.7278, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3216, "tokens_per_second_per_gpu": 18128.03, "total_tokens": 317587401 }, { "epoch": 0.20111277819454865, "grad_norm": 0.9614871144294739, "learning_rate": 2e-05, "loss": 0.7629, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3217, "tokens_per_second_per_gpu": 18215.25, "total_tokens": 317690728 }, { "epoch": 0.20117529382345586, "grad_norm": 0.9163969159126282, "learning_rate": 2e-05, "loss": 0.7137, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3218, "tokens_per_second_per_gpu": 17260.94, "total_tokens": 317789912 }, { "epoch": 0.20123780945236308, "grad_norm": 0.9146594405174255, "learning_rate": 2e-05, "loss": 0.7521, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3219, "tokens_per_second_per_gpu": 17998.41, "total_tokens": 317890856 }, { "epoch": 0.20130032508127033, "grad_norm": 0.8522748947143555, "learning_rate": 2e-05, "loss": 0.7186, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3220, "tokens_per_second_per_gpu": 18399.79, "total_tokens": 317996579 }, { "epoch": 0.20136284071017754, "grad_norm": 0.9268032312393188, "learning_rate": 2e-05, "loss": 0.7175, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3221, "tokens_per_second_per_gpu": 15838.63, "total_tokens": 318091249 }, { "epoch": 0.20142535633908476, "grad_norm": 0.9327215552330017, "learning_rate": 2e-05, "loss": 0.7773, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3222, "tokens_per_second_per_gpu": 18007.03, "total_tokens": 318190520 }, { "epoch": 0.201487871967992, "grad_norm": 0.9133601784706116, "learning_rate": 2e-05, "loss": 0.6832, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3223, "tokens_per_second_per_gpu": 17710.9, "total_tokens": 318287937 }, { "epoch": 0.20155038759689922, "grad_norm": 0.9514697194099426, "learning_rate": 2e-05, "loss": 0.7468, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3224, "tokens_per_second_per_gpu": 16113.28, "total_tokens": 318381786 }, { "epoch": 0.20161290322580644, "grad_norm": 0.9037600159645081, "learning_rate": 2e-05, "loss": 0.7224, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3225, "tokens_per_second_per_gpu": 16714.7, "total_tokens": 318478186 }, { "epoch": 0.20167541885471368, "grad_norm": 0.914962649345398, "learning_rate": 2e-05, "loss": 0.7379, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3226, "tokens_per_second_per_gpu": 18540.47, "total_tokens": 318579088 }, { "epoch": 0.2017379344836209, "grad_norm": 0.913107693195343, "learning_rate": 2e-05, "loss": 0.719, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3227, "tokens_per_second_per_gpu": 18433.88, "total_tokens": 318680637 }, { "epoch": 0.20180045011252815, "grad_norm": 0.9284030199050903, "learning_rate": 2e-05, "loss": 0.7187, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3228, "tokens_per_second_per_gpu": 17493.84, "total_tokens": 318779116 }, { "epoch": 0.20186296574143536, "grad_norm": 0.8778671026229858, "learning_rate": 2e-05, "loss": 0.7102, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3229, "tokens_per_second_per_gpu": 17347.04, "total_tokens": 318878241 }, { "epoch": 0.20192548137034258, "grad_norm": 0.9460369944572449, "learning_rate": 2e-05, "loss": 0.7091, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3230, "tokens_per_second_per_gpu": 17658.1, "total_tokens": 318977523 }, { "epoch": 0.20198799699924982, "grad_norm": 0.8687376379966736, "learning_rate": 2e-05, "loss": 0.7264, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3231, "tokens_per_second_per_gpu": 18941.67, "total_tokens": 319083060 }, { "epoch": 0.20205051262815704, "grad_norm": 0.9346702098846436, "learning_rate": 2e-05, "loss": 0.669, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3232, "tokens_per_second_per_gpu": 17656.46, "total_tokens": 319183283 }, { "epoch": 0.20211302825706426, "grad_norm": 0.9303419589996338, "learning_rate": 2e-05, "loss": 0.7098, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3233, "tokens_per_second_per_gpu": 16755.35, "total_tokens": 319278741 }, { "epoch": 0.2021755438859715, "grad_norm": 0.920996367931366, "learning_rate": 2e-05, "loss": 0.6989, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3234, "tokens_per_second_per_gpu": 16234.2, "total_tokens": 319376472 }, { "epoch": 0.20223805951487872, "grad_norm": 0.9156090021133423, "learning_rate": 2e-05, "loss": 0.7144, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3235, "tokens_per_second_per_gpu": 17787.36, "total_tokens": 319474791 }, { "epoch": 0.20230057514378594, "grad_norm": 0.9254290461540222, "learning_rate": 2e-05, "loss": 0.7112, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3236, "tokens_per_second_per_gpu": 18123.12, "total_tokens": 319574976 }, { "epoch": 0.20236309077269318, "grad_norm": 0.9250528812408447, "learning_rate": 2e-05, "loss": 0.6693, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3237, "tokens_per_second_per_gpu": 15975.05, "total_tokens": 319669181 }, { "epoch": 0.2024256064016004, "grad_norm": 0.8825724720954895, "learning_rate": 2e-05, "loss": 0.6995, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3238, "tokens_per_second_per_gpu": 17913.09, "total_tokens": 319768994 }, { "epoch": 0.20248812203050762, "grad_norm": 0.927800178527832, "learning_rate": 2e-05, "loss": 0.7458, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3239, "tokens_per_second_per_gpu": 18127.38, "total_tokens": 319867925 }, { "epoch": 0.20255063765941486, "grad_norm": 0.9722017049789429, "learning_rate": 2e-05, "loss": 0.7321, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3240, "tokens_per_second_per_gpu": 16524.21, "total_tokens": 319959932 }, { "epoch": 0.20261315328832208, "grad_norm": 0.9169378280639648, "learning_rate": 2e-05, "loss": 0.71, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3241, "tokens_per_second_per_gpu": 17740.07, "total_tokens": 320059868 }, { "epoch": 0.2026756689172293, "grad_norm": 0.8728071451187134, "learning_rate": 2e-05, "loss": 0.6885, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3242, "tokens_per_second_per_gpu": 18126.0, "total_tokens": 320160523 }, { "epoch": 0.20273818454613654, "grad_norm": 0.9434604048728943, "learning_rate": 2e-05, "loss": 0.7072, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3243, "tokens_per_second_per_gpu": 16873.43, "total_tokens": 320259862 }, { "epoch": 0.20280070017504376, "grad_norm": 0.898199737071991, "learning_rate": 2e-05, "loss": 0.7231, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3244, "tokens_per_second_per_gpu": 18108.67, "total_tokens": 320360426 }, { "epoch": 0.20286321580395097, "grad_norm": 0.898019552230835, "learning_rate": 2e-05, "loss": 0.7059, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3245, "tokens_per_second_per_gpu": 17462.55, "total_tokens": 320462610 }, { "epoch": 0.20292573143285822, "grad_norm": 0.8910817503929138, "learning_rate": 2e-05, "loss": 0.701, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3246, "tokens_per_second_per_gpu": 17746.97, "total_tokens": 320563211 }, { "epoch": 0.20298824706176544, "grad_norm": 0.9461768269538879, "learning_rate": 2e-05, "loss": 0.727, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3247, "tokens_per_second_per_gpu": 17891.23, "total_tokens": 320664166 }, { "epoch": 0.20305076269067268, "grad_norm": 0.950993537902832, "learning_rate": 2e-05, "loss": 0.705, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3248, "tokens_per_second_per_gpu": 16208.61, "total_tokens": 320759156 }, { "epoch": 0.2031132783195799, "grad_norm": 0.8675346970558167, "learning_rate": 2e-05, "loss": 0.6975, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3249, "tokens_per_second_per_gpu": 18020.67, "total_tokens": 320863483 }, { "epoch": 0.20317579394848712, "grad_norm": 0.9047886729240417, "learning_rate": 2e-05, "loss": 0.7455, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3250, "tokens_per_second_per_gpu": 17522.14, "total_tokens": 320961747 }, { "epoch": 0.20323830957739436, "grad_norm": 0.9149249792098999, "learning_rate": 2e-05, "loss": 0.7058, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3251, "tokens_per_second_per_gpu": 16939.19, "total_tokens": 321060024 }, { "epoch": 0.20330082520630158, "grad_norm": 0.9252449870109558, "learning_rate": 2e-05, "loss": 0.7386, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3252, "tokens_per_second_per_gpu": 18732.3, "total_tokens": 321162509 }, { "epoch": 0.2033633408352088, "grad_norm": 0.9269047379493713, "learning_rate": 2e-05, "loss": 0.663, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3253, "tokens_per_second_per_gpu": 16931.84, "total_tokens": 321257209 }, { "epoch": 0.20342585646411604, "grad_norm": 0.9273307919502258, "learning_rate": 2e-05, "loss": 0.6998, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3254, "tokens_per_second_per_gpu": 17208.55, "total_tokens": 321355131 }, { "epoch": 0.20348837209302326, "grad_norm": 0.9376100897789001, "learning_rate": 2e-05, "loss": 0.7358, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3255, "tokens_per_second_per_gpu": 16419.57, "total_tokens": 321449286 }, { "epoch": 0.20355088772193047, "grad_norm": 0.89688640832901, "learning_rate": 2e-05, "loss": 0.6542, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3256, "tokens_per_second_per_gpu": 17243.07, "total_tokens": 321545635 }, { "epoch": 0.20361340335083772, "grad_norm": 0.9206106066703796, "learning_rate": 2e-05, "loss": 0.6997, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3257, "tokens_per_second_per_gpu": 16646.32, "total_tokens": 321639423 }, { "epoch": 0.20367591897974494, "grad_norm": 0.9617659449577332, "learning_rate": 2e-05, "loss": 0.7372, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3258, "tokens_per_second_per_gpu": 18187.35, "total_tokens": 321740184 }, { "epoch": 0.20373843460865215, "grad_norm": 0.9361881017684937, "learning_rate": 2e-05, "loss": 0.7044, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3259, "tokens_per_second_per_gpu": 16570.62, "total_tokens": 321840721 }, { "epoch": 0.2038009502375594, "grad_norm": 0.8606578707695007, "learning_rate": 2e-05, "loss": 0.6771, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3260, "tokens_per_second_per_gpu": 16096.33, "total_tokens": 321936828 }, { "epoch": 0.20386346586646661, "grad_norm": 0.8826754689216614, "learning_rate": 2e-05, "loss": 0.7148, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3261, "tokens_per_second_per_gpu": 18037.89, "total_tokens": 322040478 }, { "epoch": 0.20392598149537383, "grad_norm": 0.9124553203582764, "learning_rate": 2e-05, "loss": 0.7135, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3262, "tokens_per_second_per_gpu": 17029.2, "total_tokens": 322140478 }, { "epoch": 0.20398849712428108, "grad_norm": 0.9366720914840698, "learning_rate": 2e-05, "loss": 0.7212, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3263, "tokens_per_second_per_gpu": 15771.23, "total_tokens": 322236995 }, { "epoch": 0.2040510127531883, "grad_norm": 0.954585075378418, "learning_rate": 2e-05, "loss": 0.6755, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3264, "tokens_per_second_per_gpu": 17989.15, "total_tokens": 322334332 }, { "epoch": 0.2041135283820955, "grad_norm": 0.8934177756309509, "learning_rate": 2e-05, "loss": 0.7123, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3265, "tokens_per_second_per_gpu": 17227.85, "total_tokens": 322433010 }, { "epoch": 0.20417604401100276, "grad_norm": 0.9304990172386169, "learning_rate": 2e-05, "loss": 0.7669, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3266, "tokens_per_second_per_gpu": 17608.09, "total_tokens": 322535274 }, { "epoch": 0.20423855963990997, "grad_norm": 0.9496126770973206, "learning_rate": 2e-05, "loss": 0.7392, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3267, "tokens_per_second_per_gpu": 16477.68, "total_tokens": 322630785 }, { "epoch": 0.20430107526881722, "grad_norm": 0.9360702037811279, "learning_rate": 2e-05, "loss": 0.7391, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3268, "tokens_per_second_per_gpu": 17394.67, "total_tokens": 322728783 }, { "epoch": 0.20436359089772443, "grad_norm": 1.0020344257354736, "learning_rate": 2e-05, "loss": 0.6786, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3269, "tokens_per_second_per_gpu": 16591.78, "total_tokens": 322823719 }, { "epoch": 0.20442610652663165, "grad_norm": 0.9656937122344971, "learning_rate": 2e-05, "loss": 0.7078, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3270, "tokens_per_second_per_gpu": 17248.7, "total_tokens": 322921581 }, { "epoch": 0.2044886221555389, "grad_norm": 0.934312641620636, "learning_rate": 2e-05, "loss": 0.717, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3271, "tokens_per_second_per_gpu": 17072.96, "total_tokens": 323022964 }, { "epoch": 0.2045511377844461, "grad_norm": 0.9397894740104675, "learning_rate": 2e-05, "loss": 0.6882, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3272, "tokens_per_second_per_gpu": 17906.96, "total_tokens": 323121790 }, { "epoch": 0.20461365341335333, "grad_norm": 0.9903281331062317, "learning_rate": 2e-05, "loss": 0.7076, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3273, "tokens_per_second_per_gpu": 18121.75, "total_tokens": 323225226 }, { "epoch": 0.20467616904226058, "grad_norm": 0.9516851305961609, "learning_rate": 2e-05, "loss": 0.7138, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3274, "tokens_per_second_per_gpu": 17356.94, "total_tokens": 323326217 }, { "epoch": 0.2047386846711678, "grad_norm": 0.9037448763847351, "learning_rate": 2e-05, "loss": 0.6953, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3275, "tokens_per_second_per_gpu": 18305.17, "total_tokens": 323428115 }, { "epoch": 0.204801200300075, "grad_norm": 0.9388135075569153, "learning_rate": 2e-05, "loss": 0.7239, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3276, "tokens_per_second_per_gpu": 17484.71, "total_tokens": 323529183 }, { "epoch": 0.20486371592898225, "grad_norm": 0.9747670888900757, "learning_rate": 2e-05, "loss": 0.738, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3277, "tokens_per_second_per_gpu": 17249.14, "total_tokens": 323629024 }, { "epoch": 0.20492623155788947, "grad_norm": 0.9694965481758118, "learning_rate": 2e-05, "loss": 0.7173, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3278, "tokens_per_second_per_gpu": 16892.63, "total_tokens": 323725030 }, { "epoch": 0.2049887471867967, "grad_norm": 0.8927041292190552, "learning_rate": 2e-05, "loss": 0.7127, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3279, "tokens_per_second_per_gpu": 18070.37, "total_tokens": 323828278 }, { "epoch": 0.20505126281570393, "grad_norm": 0.9264627695083618, "learning_rate": 2e-05, "loss": 0.7451, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3280, "tokens_per_second_per_gpu": 17491.04, "total_tokens": 323928166 }, { "epoch": 0.20511377844461115, "grad_norm": 0.9394402503967285, "learning_rate": 2e-05, "loss": 0.696, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3281, "tokens_per_second_per_gpu": 17575.13, "total_tokens": 324028457 }, { "epoch": 0.20517629407351837, "grad_norm": 0.9882702827453613, "learning_rate": 2e-05, "loss": 0.6977, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3282, "tokens_per_second_per_gpu": 17299.53, "total_tokens": 324124546 }, { "epoch": 0.2052388097024256, "grad_norm": 0.9233112931251526, "learning_rate": 2e-05, "loss": 0.722, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3283, "tokens_per_second_per_gpu": 17454.77, "total_tokens": 324223632 }, { "epoch": 0.20530132533133283, "grad_norm": 0.8853139877319336, "learning_rate": 2e-05, "loss": 0.7159, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3284, "tokens_per_second_per_gpu": 17349.0, "total_tokens": 324323867 }, { "epoch": 0.20536384096024007, "grad_norm": 0.9254509210586548, "learning_rate": 2e-05, "loss": 0.6804, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3285, "tokens_per_second_per_gpu": 18832.91, "total_tokens": 324428143 }, { "epoch": 0.2054263565891473, "grad_norm": 0.952602207660675, "learning_rate": 2e-05, "loss": 0.7319, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3286, "tokens_per_second_per_gpu": 17091.59, "total_tokens": 324528373 }, { "epoch": 0.2054888722180545, "grad_norm": 0.8913096189498901, "learning_rate": 2e-05, "loss": 0.706, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3287, "tokens_per_second_per_gpu": 17756.85, "total_tokens": 324627441 }, { "epoch": 0.20555138784696175, "grad_norm": 0.8915871381759644, "learning_rate": 2e-05, "loss": 0.718, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3288, "tokens_per_second_per_gpu": 17367.73, "total_tokens": 324728138 }, { "epoch": 0.20561390347586897, "grad_norm": 0.929375410079956, "learning_rate": 2e-05, "loss": 0.7627, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3289, "tokens_per_second_per_gpu": 18304.02, "total_tokens": 324829396 }, { "epoch": 0.2056764191047762, "grad_norm": 0.8911022543907166, "learning_rate": 2e-05, "loss": 0.7417, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3290, "tokens_per_second_per_gpu": 17913.4, "total_tokens": 324932075 }, { "epoch": 0.20573893473368343, "grad_norm": 0.8916997909545898, "learning_rate": 2e-05, "loss": 0.7064, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3291, "tokens_per_second_per_gpu": 17429.95, "total_tokens": 325029694 }, { "epoch": 0.20580145036259065, "grad_norm": 0.8610087037086487, "learning_rate": 2e-05, "loss": 0.6968, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3292, "tokens_per_second_per_gpu": 17964.52, "total_tokens": 325131125 }, { "epoch": 0.20586396599149787, "grad_norm": 0.922600269317627, "learning_rate": 2e-05, "loss": 0.7151, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3293, "tokens_per_second_per_gpu": 17972.97, "total_tokens": 325230145 }, { "epoch": 0.2059264816204051, "grad_norm": 0.9099314212799072, "learning_rate": 2e-05, "loss": 0.6638, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3294, "tokens_per_second_per_gpu": 18333.57, "total_tokens": 325327641 }, { "epoch": 0.20598899724931233, "grad_norm": 0.9313685297966003, "learning_rate": 2e-05, "loss": 0.6766, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3295, "tokens_per_second_per_gpu": 16703.98, "total_tokens": 325424393 }, { "epoch": 0.20605151287821954, "grad_norm": 0.9088039398193359, "learning_rate": 2e-05, "loss": 0.6977, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3296, "tokens_per_second_per_gpu": 17608.81, "total_tokens": 325525399 }, { "epoch": 0.2061140285071268, "grad_norm": 0.9778323173522949, "learning_rate": 2e-05, "loss": 0.7229, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3297, "tokens_per_second_per_gpu": 17834.05, "total_tokens": 325625877 }, { "epoch": 0.206176544136034, "grad_norm": 0.9283567667007446, "learning_rate": 2e-05, "loss": 0.7566, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3298, "tokens_per_second_per_gpu": 16384.53, "total_tokens": 325726420 }, { "epoch": 0.20623905976494122, "grad_norm": 0.9186805486679077, "learning_rate": 2e-05, "loss": 0.7059, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3299, "tokens_per_second_per_gpu": 17062.35, "total_tokens": 325821569 }, { "epoch": 0.20630157539384847, "grad_norm": 0.8891885280609131, "learning_rate": 2e-05, "loss": 0.6886, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3300, "tokens_per_second_per_gpu": 17778.97, "total_tokens": 325924018 }, { "epoch": 0.20636409102275569, "grad_norm": 0.8828868269920349, "learning_rate": 2e-05, "loss": 0.6801, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3301, "tokens_per_second_per_gpu": 17749.19, "total_tokens": 326022018 }, { "epoch": 0.2064266066516629, "grad_norm": 0.8970768451690674, "learning_rate": 2e-05, "loss": 0.7139, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3302, "tokens_per_second_per_gpu": 17088.5, "total_tokens": 326117647 }, { "epoch": 0.20648912228057015, "grad_norm": 0.9041146636009216, "learning_rate": 2e-05, "loss": 0.6977, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3303, "tokens_per_second_per_gpu": 18041.26, "total_tokens": 326220080 }, { "epoch": 0.20655163790947736, "grad_norm": 0.887458324432373, "learning_rate": 2e-05, "loss": 0.717, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3304, "tokens_per_second_per_gpu": 17013.8, "total_tokens": 326319421 }, { "epoch": 0.2066141535383846, "grad_norm": 0.882023274898529, "learning_rate": 2e-05, "loss": 0.7058, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3305, "tokens_per_second_per_gpu": 17101.77, "total_tokens": 326418939 }, { "epoch": 0.20667666916729183, "grad_norm": 0.9549896121025085, "learning_rate": 2e-05, "loss": 0.7373, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3306, "tokens_per_second_per_gpu": 18035.09, "total_tokens": 326519088 }, { "epoch": 0.20673918479619904, "grad_norm": 0.9051526784896851, "learning_rate": 2e-05, "loss": 0.7036, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3307, "tokens_per_second_per_gpu": 17564.02, "total_tokens": 326618579 }, { "epoch": 0.2068017004251063, "grad_norm": 0.9961786270141602, "learning_rate": 2e-05, "loss": 0.6986, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3308, "tokens_per_second_per_gpu": 16702.4, "total_tokens": 326716445 }, { "epoch": 0.2068642160540135, "grad_norm": 1.0118831396102905, "learning_rate": 2e-05, "loss": 0.7645, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3309, "tokens_per_second_per_gpu": 17221.6, "total_tokens": 326816613 }, { "epoch": 0.20692673168292072, "grad_norm": 0.9266097545623779, "learning_rate": 2e-05, "loss": 0.6713, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3310, "tokens_per_second_per_gpu": 17018.8, "total_tokens": 326914176 }, { "epoch": 0.20698924731182797, "grad_norm": 1.0039528608322144, "learning_rate": 2e-05, "loss": 0.6899, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3311, "tokens_per_second_per_gpu": 18073.94, "total_tokens": 327014286 }, { "epoch": 0.20705176294073518, "grad_norm": 0.9134907126426697, "learning_rate": 2e-05, "loss": 0.7297, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3312, "tokens_per_second_per_gpu": 17621.72, "total_tokens": 327112049 }, { "epoch": 0.2071142785696424, "grad_norm": 0.9447996020317078, "learning_rate": 2e-05, "loss": 0.698, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3313, "tokens_per_second_per_gpu": 16958.51, "total_tokens": 327210170 }, { "epoch": 0.20717679419854965, "grad_norm": 0.9021797180175781, "learning_rate": 2e-05, "loss": 0.7288, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3314, "tokens_per_second_per_gpu": 18222.8, "total_tokens": 327315044 }, { "epoch": 0.20723930982745686, "grad_norm": 0.947604775428772, "learning_rate": 2e-05, "loss": 0.7112, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3315, "tokens_per_second_per_gpu": 17854.85, "total_tokens": 327410070 }, { "epoch": 0.20730182545636408, "grad_norm": 0.9442266225814819, "learning_rate": 2e-05, "loss": 0.7562, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3316, "tokens_per_second_per_gpu": 17597.54, "total_tokens": 327511605 }, { "epoch": 0.20736434108527133, "grad_norm": 0.9354389905929565, "learning_rate": 2e-05, "loss": 0.7332, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3317, "tokens_per_second_per_gpu": 16575.57, "total_tokens": 327605311 }, { "epoch": 0.20742685671417854, "grad_norm": 0.8689397573471069, "learning_rate": 2e-05, "loss": 0.707, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3318, "tokens_per_second_per_gpu": 18677.31, "total_tokens": 327708209 }, { "epoch": 0.20748937234308576, "grad_norm": 0.9042930603027344, "learning_rate": 2e-05, "loss": 0.7306, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3319, "tokens_per_second_per_gpu": 16913.77, "total_tokens": 327808489 }, { "epoch": 0.207551887971993, "grad_norm": 0.9685072302818298, "learning_rate": 2e-05, "loss": 0.6914, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3320, "tokens_per_second_per_gpu": 17619.11, "total_tokens": 327909756 }, { "epoch": 0.20761440360090022, "grad_norm": 0.9052004218101501, "learning_rate": 2e-05, "loss": 0.6612, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3321, "tokens_per_second_per_gpu": 16499.77, "total_tokens": 328007889 }, { "epoch": 0.20767691922980744, "grad_norm": 0.9422147274017334, "learning_rate": 2e-05, "loss": 0.6672, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3322, "tokens_per_second_per_gpu": 18214.19, "total_tokens": 328105727 }, { "epoch": 0.20773943485871468, "grad_norm": 0.8887574076652527, "learning_rate": 2e-05, "loss": 0.6872, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3323, "tokens_per_second_per_gpu": 17607.64, "total_tokens": 328207867 }, { "epoch": 0.2078019504876219, "grad_norm": 0.9018566012382507, "learning_rate": 2e-05, "loss": 0.7521, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3324, "tokens_per_second_per_gpu": 17892.97, "total_tokens": 328312543 }, { "epoch": 0.20786446611652915, "grad_norm": 0.9003579020500183, "learning_rate": 2e-05, "loss": 0.6889, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3325, "tokens_per_second_per_gpu": 15869.35, "total_tokens": 328410821 }, { "epoch": 0.20792698174543636, "grad_norm": 0.9275116324424744, "learning_rate": 2e-05, "loss": 0.7031, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3326, "tokens_per_second_per_gpu": 15785.99, "total_tokens": 328505207 }, { "epoch": 0.20798949737434358, "grad_norm": 0.925099790096283, "learning_rate": 2e-05, "loss": 0.7029, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3327, "tokens_per_second_per_gpu": 16548.61, "total_tokens": 328600970 }, { "epoch": 0.20805201300325082, "grad_norm": 0.9199345707893372, "learning_rate": 2e-05, "loss": 0.7013, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3328, "tokens_per_second_per_gpu": 16631.84, "total_tokens": 328698990 }, { "epoch": 0.20811452863215804, "grad_norm": 0.9010401368141174, "learning_rate": 2e-05, "loss": 0.734, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3329, "tokens_per_second_per_gpu": 18280.3, "total_tokens": 328802908 }, { "epoch": 0.20817704426106526, "grad_norm": 0.9773747324943542, "learning_rate": 2e-05, "loss": 0.6997, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3330, "tokens_per_second_per_gpu": 15441.68, "total_tokens": 328896079 }, { "epoch": 0.2082395598899725, "grad_norm": 0.9241688847541809, "learning_rate": 2e-05, "loss": 0.7036, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3331, "tokens_per_second_per_gpu": 18802.51, "total_tokens": 328995352 }, { "epoch": 0.20830207551887972, "grad_norm": 0.8678909540176392, "learning_rate": 2e-05, "loss": 0.6921, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3332, "tokens_per_second_per_gpu": 17652.96, "total_tokens": 329097341 }, { "epoch": 0.20836459114778694, "grad_norm": 0.9798464775085449, "learning_rate": 2e-05, "loss": 0.7264, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3333, "tokens_per_second_per_gpu": 18608.14, "total_tokens": 329196928 }, { "epoch": 0.20842710677669418, "grad_norm": 0.9375390410423279, "learning_rate": 2e-05, "loss": 0.688, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3334, "tokens_per_second_per_gpu": 17205.09, "total_tokens": 329294011 }, { "epoch": 0.2084896224056014, "grad_norm": 0.9771442413330078, "learning_rate": 2e-05, "loss": 0.7686, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3335, "tokens_per_second_per_gpu": 17576.97, "total_tokens": 329394680 }, { "epoch": 0.20855213803450862, "grad_norm": 0.8761880397796631, "learning_rate": 2e-05, "loss": 0.7198, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3336, "tokens_per_second_per_gpu": 17493.04, "total_tokens": 329497478 }, { "epoch": 0.20861465366341586, "grad_norm": 0.8760732412338257, "learning_rate": 2e-05, "loss": 0.6562, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3337, "tokens_per_second_per_gpu": 16892.76, "total_tokens": 329599957 }, { "epoch": 0.20867716929232308, "grad_norm": 0.9276087284088135, "learning_rate": 2e-05, "loss": 0.6818, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3338, "tokens_per_second_per_gpu": 17155.24, "total_tokens": 329699589 }, { "epoch": 0.2087396849212303, "grad_norm": 0.9244208335876465, "learning_rate": 2e-05, "loss": 0.668, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3339, "tokens_per_second_per_gpu": 16557.12, "total_tokens": 329795150 }, { "epoch": 0.20880220055013754, "grad_norm": 0.8926609754562378, "learning_rate": 2e-05, "loss": 0.7186, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3340, "tokens_per_second_per_gpu": 18692.27, "total_tokens": 329901105 }, { "epoch": 0.20886471617904476, "grad_norm": 0.8776997327804565, "learning_rate": 2e-05, "loss": 0.6889, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3341, "tokens_per_second_per_gpu": 18225.95, "total_tokens": 330001272 }, { "epoch": 0.20892723180795197, "grad_norm": 0.907584011554718, "learning_rate": 2e-05, "loss": 0.7334, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3342, "tokens_per_second_per_gpu": 17556.62, "total_tokens": 330099152 }, { "epoch": 0.20898974743685922, "grad_norm": 0.8704332709312439, "learning_rate": 2e-05, "loss": 0.6849, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3343, "tokens_per_second_per_gpu": 16662.55, "total_tokens": 330201055 }, { "epoch": 0.20905226306576644, "grad_norm": 0.9551135897636414, "learning_rate": 2e-05, "loss": 0.7475, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3344, "tokens_per_second_per_gpu": 16720.39, "total_tokens": 330301147 }, { "epoch": 0.20911477869467368, "grad_norm": 0.9014025926589966, "learning_rate": 2e-05, "loss": 0.7171, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3345, "tokens_per_second_per_gpu": 17738.67, "total_tokens": 330402736 }, { "epoch": 0.2091772943235809, "grad_norm": 0.9674652814865112, "learning_rate": 2e-05, "loss": 0.7174, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3346, "tokens_per_second_per_gpu": 18054.31, "total_tokens": 330497945 }, { "epoch": 0.20923980995248811, "grad_norm": 0.9403390288352966, "learning_rate": 2e-05, "loss": 0.6748, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3347, "tokens_per_second_per_gpu": 15776.7, "total_tokens": 330592514 }, { "epoch": 0.20930232558139536, "grad_norm": 0.8789780735969543, "learning_rate": 2e-05, "loss": 0.6835, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3348, "tokens_per_second_per_gpu": 16975.81, "total_tokens": 330691351 }, { "epoch": 0.20936484121030258, "grad_norm": 0.9276930689811707, "learning_rate": 2e-05, "loss": 0.7107, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3349, "tokens_per_second_per_gpu": 16254.28, "total_tokens": 330789583 }, { "epoch": 0.2094273568392098, "grad_norm": 0.9310871362686157, "learning_rate": 2e-05, "loss": 0.6975, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3350, "tokens_per_second_per_gpu": 17223.83, "total_tokens": 330884961 }, { "epoch": 0.20948987246811704, "grad_norm": 0.8930123448371887, "learning_rate": 2e-05, "loss": 0.7358, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3351, "tokens_per_second_per_gpu": 17764.91, "total_tokens": 330986892 }, { "epoch": 0.20955238809702426, "grad_norm": 0.8930652141571045, "learning_rate": 2e-05, "loss": 0.6868, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3352, "tokens_per_second_per_gpu": 18121.04, "total_tokens": 331086486 }, { "epoch": 0.20961490372593147, "grad_norm": 0.9140028357505798, "learning_rate": 2e-05, "loss": 0.7318, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3353, "tokens_per_second_per_gpu": 17506.81, "total_tokens": 331191292 }, { "epoch": 0.20967741935483872, "grad_norm": 0.9371730089187622, "learning_rate": 2e-05, "loss": 0.7143, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3354, "tokens_per_second_per_gpu": 16089.94, "total_tokens": 331287138 }, { "epoch": 0.20973993498374593, "grad_norm": 0.954994797706604, "learning_rate": 2e-05, "loss": 0.7288, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3355, "tokens_per_second_per_gpu": 17812.93, "total_tokens": 331384179 }, { "epoch": 0.20980245061265315, "grad_norm": 0.8606671690940857, "learning_rate": 2e-05, "loss": 0.6746, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3356, "tokens_per_second_per_gpu": 17929.78, "total_tokens": 331484123 }, { "epoch": 0.2098649662415604, "grad_norm": 0.9269264340400696, "learning_rate": 2e-05, "loss": 0.6424, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3357, "tokens_per_second_per_gpu": 16767.91, "total_tokens": 331580953 }, { "epoch": 0.2099274818704676, "grad_norm": 0.907581090927124, "learning_rate": 2e-05, "loss": 0.6765, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3358, "tokens_per_second_per_gpu": 16813.38, "total_tokens": 331679758 }, { "epoch": 0.20998999749937483, "grad_norm": 0.8663871884346008, "learning_rate": 2e-05, "loss": 0.7484, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3359, "tokens_per_second_per_gpu": 18717.63, "total_tokens": 331785809 }, { "epoch": 0.21005251312828208, "grad_norm": 0.9058451056480408, "learning_rate": 2e-05, "loss": 0.7082, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3360, "tokens_per_second_per_gpu": 16048.45, "total_tokens": 331883981 }, { "epoch": 0.2101150287571893, "grad_norm": 0.9018242955207825, "learning_rate": 2e-05, "loss": 0.6587, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3361, "tokens_per_second_per_gpu": 16654.43, "total_tokens": 331974965 }, { "epoch": 0.21017754438609654, "grad_norm": 0.9383875727653503, "learning_rate": 2e-05, "loss": 0.6707, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3362, "tokens_per_second_per_gpu": 16837.26, "total_tokens": 332074266 }, { "epoch": 0.21024006001500375, "grad_norm": 0.9007526636123657, "learning_rate": 2e-05, "loss": 0.7059, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3363, "tokens_per_second_per_gpu": 17541.75, "total_tokens": 332175110 }, { "epoch": 0.21030257564391097, "grad_norm": 0.938340961933136, "learning_rate": 2e-05, "loss": 0.7077, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3364, "tokens_per_second_per_gpu": 17148.15, "total_tokens": 332273741 }, { "epoch": 0.21036509127281822, "grad_norm": 0.9282808303833008, "learning_rate": 2e-05, "loss": 0.7068, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3365, "tokens_per_second_per_gpu": 17760.47, "total_tokens": 332374870 }, { "epoch": 0.21042760690172543, "grad_norm": 0.9039899110794067, "learning_rate": 2e-05, "loss": 0.7044, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3366, "tokens_per_second_per_gpu": 17603.81, "total_tokens": 332476964 }, { "epoch": 0.21049012253063265, "grad_norm": 0.933885395526886, "learning_rate": 2e-05, "loss": 0.7541, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3367, "tokens_per_second_per_gpu": 18501.22, "total_tokens": 332577948 }, { "epoch": 0.2105526381595399, "grad_norm": 0.9394276142120361, "learning_rate": 2e-05, "loss": 0.8101, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3368, "tokens_per_second_per_gpu": 18225.26, "total_tokens": 332680260 }, { "epoch": 0.2106151537884471, "grad_norm": 0.913177490234375, "learning_rate": 2e-05, "loss": 0.6538, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3369, "tokens_per_second_per_gpu": 16092.14, "total_tokens": 332773008 }, { "epoch": 0.21067766941735433, "grad_norm": 0.92917799949646, "learning_rate": 2e-05, "loss": 0.7197, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3370, "tokens_per_second_per_gpu": 16913.74, "total_tokens": 332871605 }, { "epoch": 0.21074018504626157, "grad_norm": 0.8997038006782532, "learning_rate": 2e-05, "loss": 0.6679, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3371, "tokens_per_second_per_gpu": 16657.57, "total_tokens": 332966719 }, { "epoch": 0.2108027006751688, "grad_norm": 0.9302802681922913, "learning_rate": 2e-05, "loss": 0.71, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3372, "tokens_per_second_per_gpu": 15992.39, "total_tokens": 333065448 }, { "epoch": 0.210865216304076, "grad_norm": 0.930272102355957, "learning_rate": 2e-05, "loss": 0.7141, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3373, "tokens_per_second_per_gpu": 15928.46, "total_tokens": 333163089 }, { "epoch": 0.21092773193298325, "grad_norm": 0.9405153393745422, "learning_rate": 2e-05, "loss": 0.6751, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3374, "tokens_per_second_per_gpu": 16985.16, "total_tokens": 333260958 }, { "epoch": 0.21099024756189047, "grad_norm": 0.9125068187713623, "learning_rate": 2e-05, "loss": 0.6961, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3375, "tokens_per_second_per_gpu": 17806.09, "total_tokens": 333359707 }, { "epoch": 0.2110527631907977, "grad_norm": 0.9001886248588562, "learning_rate": 2e-05, "loss": 0.699, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3376, "tokens_per_second_per_gpu": 16076.35, "total_tokens": 333457895 }, { "epoch": 0.21111527881970493, "grad_norm": 0.912761390209198, "learning_rate": 2e-05, "loss": 0.7072, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3377, "tokens_per_second_per_gpu": 17847.43, "total_tokens": 333558232 }, { "epoch": 0.21117779444861215, "grad_norm": 0.9603567123413086, "learning_rate": 2e-05, "loss": 0.6946, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3378, "tokens_per_second_per_gpu": 17084.32, "total_tokens": 333655442 }, { "epoch": 0.21124031007751937, "grad_norm": 0.8812183737754822, "learning_rate": 2e-05, "loss": 0.6778, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3379, "tokens_per_second_per_gpu": 16901.4, "total_tokens": 333754028 }, { "epoch": 0.2113028257064266, "grad_norm": 0.9078694581985474, "learning_rate": 2e-05, "loss": 0.7074, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3380, "tokens_per_second_per_gpu": 16926.03, "total_tokens": 333852083 }, { "epoch": 0.21136534133533383, "grad_norm": 0.9326125383377075, "learning_rate": 2e-05, "loss": 0.7407, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3381, "tokens_per_second_per_gpu": 18420.02, "total_tokens": 333950099 }, { "epoch": 0.21142785696424107, "grad_norm": 0.9315022230148315, "learning_rate": 2e-05, "loss": 0.6766, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3382, "tokens_per_second_per_gpu": 17232.95, "total_tokens": 334050238 }, { "epoch": 0.2114903725931483, "grad_norm": 0.9285011887550354, "learning_rate": 2e-05, "loss": 0.7279, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3383, "tokens_per_second_per_gpu": 17781.26, "total_tokens": 334151441 }, { "epoch": 0.2115528882220555, "grad_norm": 0.9101738929748535, "learning_rate": 2e-05, "loss": 0.7054, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3384, "tokens_per_second_per_gpu": 17124.4, "total_tokens": 334250139 }, { "epoch": 0.21161540385096275, "grad_norm": 0.8761592507362366, "learning_rate": 2e-05, "loss": 0.6786, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3385, "tokens_per_second_per_gpu": 17718.24, "total_tokens": 334349365 }, { "epoch": 0.21167791947986997, "grad_norm": 0.9006868600845337, "learning_rate": 2e-05, "loss": 0.7149, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3386, "tokens_per_second_per_gpu": 17091.83, "total_tokens": 334448105 }, { "epoch": 0.2117404351087772, "grad_norm": 0.9099477529525757, "learning_rate": 2e-05, "loss": 0.7141, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3387, "tokens_per_second_per_gpu": 17471.4, "total_tokens": 334548523 }, { "epoch": 0.21180295073768443, "grad_norm": 0.9421185255050659, "learning_rate": 2e-05, "loss": 0.7231, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3388, "tokens_per_second_per_gpu": 18503.05, "total_tokens": 334648811 }, { "epoch": 0.21186546636659165, "grad_norm": 0.953018069267273, "learning_rate": 2e-05, "loss": 0.7122, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3389, "tokens_per_second_per_gpu": 16952.15, "total_tokens": 334748855 }, { "epoch": 0.21192798199549887, "grad_norm": 0.9477548599243164, "learning_rate": 2e-05, "loss": 0.7292, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3390, "tokens_per_second_per_gpu": 17195.17, "total_tokens": 334850819 }, { "epoch": 0.2119904976244061, "grad_norm": 0.9236243367195129, "learning_rate": 2e-05, "loss": 0.7088, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3391, "tokens_per_second_per_gpu": 18545.17, "total_tokens": 334953967 }, { "epoch": 0.21205301325331333, "grad_norm": 0.9282516837120056, "learning_rate": 2e-05, "loss": 0.7146, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3392, "tokens_per_second_per_gpu": 17315.39, "total_tokens": 335049889 }, { "epoch": 0.21211552888222054, "grad_norm": 0.8812220096588135, "learning_rate": 2e-05, "loss": 0.7382, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3393, "tokens_per_second_per_gpu": 17276.82, "total_tokens": 335150689 }, { "epoch": 0.2121780445111278, "grad_norm": 0.8778788447380066, "learning_rate": 2e-05, "loss": 0.6673, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3394, "tokens_per_second_per_gpu": 16746.98, "total_tokens": 335248980 }, { "epoch": 0.212240560140035, "grad_norm": 0.9458943009376526, "learning_rate": 2e-05, "loss": 0.7219, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3395, "tokens_per_second_per_gpu": 16290.84, "total_tokens": 335343753 }, { "epoch": 0.21230307576894222, "grad_norm": 0.9266532063484192, "learning_rate": 2e-05, "loss": 0.7122, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3396, "tokens_per_second_per_gpu": 17761.24, "total_tokens": 335447683 }, { "epoch": 0.21236559139784947, "grad_norm": 0.9251901507377625, "learning_rate": 2e-05, "loss": 0.7172, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3397, "tokens_per_second_per_gpu": 17691.17, "total_tokens": 335547448 }, { "epoch": 0.21242810702675669, "grad_norm": 0.8710814118385315, "learning_rate": 2e-05, "loss": 0.6664, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3398, "tokens_per_second_per_gpu": 16870.4, "total_tokens": 335647260 }, { "epoch": 0.2124906226556639, "grad_norm": 0.9767580032348633, "learning_rate": 2e-05, "loss": 0.7362, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3399, "tokens_per_second_per_gpu": 17850.01, "total_tokens": 335747269 }, { "epoch": 0.21255313828457115, "grad_norm": 0.899987518787384, "learning_rate": 2e-05, "loss": 0.684, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3400, "tokens_per_second_per_gpu": 17417.06, "total_tokens": 335847765 }, { "epoch": 0.21261565391347836, "grad_norm": 0.952929675579071, "learning_rate": 2e-05, "loss": 0.6628, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3401, "tokens_per_second_per_gpu": 16723.49, "total_tokens": 335944286 }, { "epoch": 0.2126781695423856, "grad_norm": 0.9355248212814331, "learning_rate": 2e-05, "loss": 0.7112, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3402, "tokens_per_second_per_gpu": 17767.25, "total_tokens": 336040804 }, { "epoch": 0.21274068517129283, "grad_norm": 0.9536468982696533, "learning_rate": 2e-05, "loss": 0.7088, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3403, "tokens_per_second_per_gpu": 19155.75, "total_tokens": 336144877 }, { "epoch": 0.21280320080020004, "grad_norm": 0.9527645111083984, "learning_rate": 2e-05, "loss": 0.7282, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3404, "tokens_per_second_per_gpu": 17196.22, "total_tokens": 336246181 }, { "epoch": 0.2128657164291073, "grad_norm": 0.9297769069671631, "learning_rate": 2e-05, "loss": 0.6883, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3405, "tokens_per_second_per_gpu": 16886.89, "total_tokens": 336342602 }, { "epoch": 0.2129282320580145, "grad_norm": 0.8984578847885132, "learning_rate": 2e-05, "loss": 0.6817, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3406, "tokens_per_second_per_gpu": 17680.34, "total_tokens": 336441240 }, { "epoch": 0.21299074768692172, "grad_norm": 0.8970364332199097, "learning_rate": 2e-05, "loss": 0.7105, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3407, "tokens_per_second_per_gpu": 17682.06, "total_tokens": 336543174 }, { "epoch": 0.21305326331582897, "grad_norm": 0.9370589256286621, "learning_rate": 2e-05, "loss": 0.729, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3408, "tokens_per_second_per_gpu": 17232.12, "total_tokens": 336642608 }, { "epoch": 0.21311577894473618, "grad_norm": 0.9426548480987549, "learning_rate": 2e-05, "loss": 0.7159, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3409, "tokens_per_second_per_gpu": 16342.17, "total_tokens": 336740893 }, { "epoch": 0.2131782945736434, "grad_norm": 0.9296088814735413, "learning_rate": 2e-05, "loss": 0.7172, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3410, "tokens_per_second_per_gpu": 17975.38, "total_tokens": 336841929 }, { "epoch": 0.21324081020255065, "grad_norm": 0.9475957155227661, "learning_rate": 2e-05, "loss": 0.7199, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3411, "tokens_per_second_per_gpu": 18326.66, "total_tokens": 336942559 }, { "epoch": 0.21330332583145786, "grad_norm": 0.9098148345947266, "learning_rate": 2e-05, "loss": 0.6979, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3412, "tokens_per_second_per_gpu": 17029.09, "total_tokens": 337035272 }, { "epoch": 0.21336584146036508, "grad_norm": 0.9111841320991516, "learning_rate": 2e-05, "loss": 0.7088, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3413, "tokens_per_second_per_gpu": 16891.75, "total_tokens": 337131536 }, { "epoch": 0.21342835708927232, "grad_norm": 0.968588650226593, "learning_rate": 2e-05, "loss": 0.6646, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3414, "tokens_per_second_per_gpu": 15694.14, "total_tokens": 337228273 }, { "epoch": 0.21349087271817954, "grad_norm": 0.9611263275146484, "learning_rate": 2e-05, "loss": 0.6994, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3415, "tokens_per_second_per_gpu": 17069.75, "total_tokens": 337323662 }, { "epoch": 0.21355338834708676, "grad_norm": 0.9214547872543335, "learning_rate": 2e-05, "loss": 0.7448, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3416, "tokens_per_second_per_gpu": 17849.92, "total_tokens": 337424382 }, { "epoch": 0.213615903975994, "grad_norm": 0.8801717758178711, "learning_rate": 2e-05, "loss": 0.6599, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3417, "tokens_per_second_per_gpu": 16868.91, "total_tokens": 337523156 }, { "epoch": 0.21367841960490122, "grad_norm": 0.9326563477516174, "learning_rate": 2e-05, "loss": 0.6597, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3418, "tokens_per_second_per_gpu": 16233.49, "total_tokens": 337618052 }, { "epoch": 0.21374093523380847, "grad_norm": 1.0038191080093384, "learning_rate": 2e-05, "loss": 0.712, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3419, "tokens_per_second_per_gpu": 17367.64, "total_tokens": 337720625 }, { "epoch": 0.21380345086271568, "grad_norm": 0.9857380986213684, "learning_rate": 2e-05, "loss": 0.7035, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3420, "tokens_per_second_per_gpu": 17820.67, "total_tokens": 337824652 }, { "epoch": 0.2138659664916229, "grad_norm": 0.8660918474197388, "learning_rate": 2e-05, "loss": 0.6826, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3421, "tokens_per_second_per_gpu": 17427.54, "total_tokens": 337925197 }, { "epoch": 0.21392848212053014, "grad_norm": 0.9455657601356506, "learning_rate": 2e-05, "loss": 0.7099, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3422, "tokens_per_second_per_gpu": 17438.91, "total_tokens": 338023901 }, { "epoch": 0.21399099774943736, "grad_norm": 0.964224636554718, "learning_rate": 2e-05, "loss": 0.7258, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3423, "tokens_per_second_per_gpu": 17896.87, "total_tokens": 338125311 }, { "epoch": 0.21405351337834458, "grad_norm": 0.9387924671173096, "learning_rate": 2e-05, "loss": 0.7114, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3424, "tokens_per_second_per_gpu": 17991.19, "total_tokens": 338225318 }, { "epoch": 0.21411602900725182, "grad_norm": 0.8878474235534668, "learning_rate": 2e-05, "loss": 0.6447, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3425, "tokens_per_second_per_gpu": 16147.44, "total_tokens": 338318034 }, { "epoch": 0.21417854463615904, "grad_norm": 0.9243015050888062, "learning_rate": 2e-05, "loss": 0.6824, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3426, "tokens_per_second_per_gpu": 16124.78, "total_tokens": 338413394 }, { "epoch": 0.21424106026506626, "grad_norm": 0.9197579622268677, "learning_rate": 2e-05, "loss": 0.6994, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3427, "tokens_per_second_per_gpu": 16470.84, "total_tokens": 338510114 }, { "epoch": 0.2143035758939735, "grad_norm": 0.9245789647102356, "learning_rate": 2e-05, "loss": 0.7165, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3428, "tokens_per_second_per_gpu": 17645.55, "total_tokens": 338610599 }, { "epoch": 0.21436609152288072, "grad_norm": 0.9117740988731384, "learning_rate": 2e-05, "loss": 0.7195, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3429, "tokens_per_second_per_gpu": 17194.57, "total_tokens": 338709147 }, { "epoch": 0.21442860715178794, "grad_norm": 0.9069274067878723, "learning_rate": 2e-05, "loss": 0.7128, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3430, "tokens_per_second_per_gpu": 17371.44, "total_tokens": 338809205 }, { "epoch": 0.21449112278069518, "grad_norm": 0.892968475818634, "learning_rate": 2e-05, "loss": 0.6746, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3431, "tokens_per_second_per_gpu": 17815.42, "total_tokens": 338910215 }, { "epoch": 0.2145536384096024, "grad_norm": 0.9090484976768494, "learning_rate": 2e-05, "loss": 0.7268, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3432, "tokens_per_second_per_gpu": 18236.2, "total_tokens": 339012081 }, { "epoch": 0.21461615403850962, "grad_norm": 0.9193006157875061, "learning_rate": 2e-05, "loss": 0.6883, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3433, "tokens_per_second_per_gpu": 16968.45, "total_tokens": 339109988 }, { "epoch": 0.21467866966741686, "grad_norm": 0.9282892346382141, "learning_rate": 2e-05, "loss": 0.7361, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3434, "tokens_per_second_per_gpu": 17529.36, "total_tokens": 339211580 }, { "epoch": 0.21474118529632408, "grad_norm": 0.8950605392456055, "learning_rate": 2e-05, "loss": 0.6955, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3435, "tokens_per_second_per_gpu": 17588.08, "total_tokens": 339309273 }, { "epoch": 0.2148037009252313, "grad_norm": 0.8904788494110107, "learning_rate": 2e-05, "loss": 0.6767, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3436, "tokens_per_second_per_gpu": 17376.12, "total_tokens": 339410964 }, { "epoch": 0.21486621655413854, "grad_norm": 0.9284491539001465, "learning_rate": 2e-05, "loss": 0.7347, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3437, "tokens_per_second_per_gpu": 16496.69, "total_tokens": 339505767 }, { "epoch": 0.21492873218304576, "grad_norm": 0.8996725082397461, "learning_rate": 2e-05, "loss": 0.726, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3438, "tokens_per_second_per_gpu": 17660.2, "total_tokens": 339606989 }, { "epoch": 0.214991247811953, "grad_norm": 0.9335187077522278, "learning_rate": 2e-05, "loss": 0.6717, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3439, "tokens_per_second_per_gpu": 18119.4, "total_tokens": 339708338 }, { "epoch": 0.21505376344086022, "grad_norm": 0.9383832812309265, "learning_rate": 2e-05, "loss": 0.6862, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3440, "tokens_per_second_per_gpu": 16285.46, "total_tokens": 339803855 }, { "epoch": 0.21511627906976744, "grad_norm": 0.920080304145813, "learning_rate": 2e-05, "loss": 0.7377, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3441, "tokens_per_second_per_gpu": 17651.68, "total_tokens": 339903375 }, { "epoch": 0.21517879469867468, "grad_norm": 0.9423712491989136, "learning_rate": 2e-05, "loss": 0.6735, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3442, "tokens_per_second_per_gpu": 17302.69, "total_tokens": 340002675 }, { "epoch": 0.2152413103275819, "grad_norm": 0.9665266275405884, "learning_rate": 2e-05, "loss": 0.7143, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3443, "tokens_per_second_per_gpu": 17371.98, "total_tokens": 340100513 }, { "epoch": 0.21530382595648911, "grad_norm": 0.8696658611297607, "learning_rate": 2e-05, "loss": 0.6905, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3444, "tokens_per_second_per_gpu": 17053.48, "total_tokens": 340200082 }, { "epoch": 0.21536634158539636, "grad_norm": 0.8659653067588806, "learning_rate": 2e-05, "loss": 0.699, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3445, "tokens_per_second_per_gpu": 17834.62, "total_tokens": 340303604 }, { "epoch": 0.21542885721430358, "grad_norm": 0.8954638838768005, "learning_rate": 2e-05, "loss": 0.7241, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3446, "tokens_per_second_per_gpu": 17015.42, "total_tokens": 340404639 }, { "epoch": 0.2154913728432108, "grad_norm": 0.9229937195777893, "learning_rate": 2e-05, "loss": 0.7399, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3447, "tokens_per_second_per_gpu": 16992.24, "total_tokens": 340503253 }, { "epoch": 0.21555388847211804, "grad_norm": 0.888698160648346, "learning_rate": 2e-05, "loss": 0.6808, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3448, "tokens_per_second_per_gpu": 17139.04, "total_tokens": 340601698 }, { "epoch": 0.21561640410102526, "grad_norm": 0.882743775844574, "learning_rate": 2e-05, "loss": 0.6654, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3449, "tokens_per_second_per_gpu": 18685.63, "total_tokens": 340702098 }, { "epoch": 0.21567891972993247, "grad_norm": 0.9331004619598389, "learning_rate": 2e-05, "loss": 0.6933, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3450, "tokens_per_second_per_gpu": 16975.65, "total_tokens": 340801342 }, { "epoch": 0.21574143535883972, "grad_norm": 0.8686053156852722, "learning_rate": 2e-05, "loss": 0.7134, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3451, "tokens_per_second_per_gpu": 17212.68, "total_tokens": 340901547 }, { "epoch": 0.21580395098774693, "grad_norm": 0.9118778705596924, "learning_rate": 2e-05, "loss": 0.6769, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3452, "tokens_per_second_per_gpu": 17754.5, "total_tokens": 340999585 }, { "epoch": 0.21586646661665415, "grad_norm": 0.8904048204421997, "learning_rate": 2e-05, "loss": 0.6755, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3453, "tokens_per_second_per_gpu": 18350.99, "total_tokens": 341099775 }, { "epoch": 0.2159289822455614, "grad_norm": 0.8729327321052551, "learning_rate": 2e-05, "loss": 0.7354, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3454, "tokens_per_second_per_gpu": 18493.87, "total_tokens": 341208470 }, { "epoch": 0.2159914978744686, "grad_norm": 0.9115880131721497, "learning_rate": 2e-05, "loss": 0.6735, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3455, "tokens_per_second_per_gpu": 16944.62, "total_tokens": 341303511 }, { "epoch": 0.21605401350337583, "grad_norm": 0.9032296538352966, "learning_rate": 2e-05, "loss": 0.6951, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3456, "tokens_per_second_per_gpu": 17396.98, "total_tokens": 341399499 }, { "epoch": 0.21611652913228308, "grad_norm": 0.9236092567443848, "learning_rate": 2e-05, "loss": 0.7044, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3457, "tokens_per_second_per_gpu": 17691.18, "total_tokens": 341496319 }, { "epoch": 0.2161790447611903, "grad_norm": 0.9370045065879822, "learning_rate": 2e-05, "loss": 0.7326, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3458, "tokens_per_second_per_gpu": 18200.23, "total_tokens": 341600083 }, { "epoch": 0.21624156039009754, "grad_norm": 0.9532602429389954, "learning_rate": 2e-05, "loss": 0.7029, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3459, "tokens_per_second_per_gpu": 16469.0, "total_tokens": 341698434 }, { "epoch": 0.21630407601900475, "grad_norm": 0.9108281135559082, "learning_rate": 2e-05, "loss": 0.7008, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3460, "tokens_per_second_per_gpu": 17487.65, "total_tokens": 341795998 }, { "epoch": 0.21636659164791197, "grad_norm": 0.878275454044342, "learning_rate": 2e-05, "loss": 0.701, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3461, "tokens_per_second_per_gpu": 17629.76, "total_tokens": 341894086 }, { "epoch": 0.21642910727681922, "grad_norm": 0.8776189088821411, "learning_rate": 2e-05, "loss": 0.6712, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3462, "tokens_per_second_per_gpu": 15836.26, "total_tokens": 341988880 }, { "epoch": 0.21649162290572643, "grad_norm": 0.9079241752624512, "learning_rate": 2e-05, "loss": 0.7185, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3463, "tokens_per_second_per_gpu": 16768.64, "total_tokens": 342087795 }, { "epoch": 0.21655413853463365, "grad_norm": 0.920820415019989, "learning_rate": 2e-05, "loss": 0.7574, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3464, "tokens_per_second_per_gpu": 18468.8, "total_tokens": 342191012 }, { "epoch": 0.2166166541635409, "grad_norm": 0.9517201781272888, "learning_rate": 2e-05, "loss": 0.7353, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3465, "tokens_per_second_per_gpu": 17553.34, "total_tokens": 342293515 }, { "epoch": 0.2166791697924481, "grad_norm": 0.9018070697784424, "learning_rate": 2e-05, "loss": 0.714, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3466, "tokens_per_second_per_gpu": 17347.14, "total_tokens": 342391773 }, { "epoch": 0.21674168542135533, "grad_norm": 0.9044578671455383, "learning_rate": 2e-05, "loss": 0.6781, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3467, "tokens_per_second_per_gpu": 18098.84, "total_tokens": 342490871 }, { "epoch": 0.21680420105026257, "grad_norm": 0.9073135852813721, "learning_rate": 2e-05, "loss": 0.6839, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3468, "tokens_per_second_per_gpu": 17151.48, "total_tokens": 342590843 }, { "epoch": 0.2168667166791698, "grad_norm": 0.9175869822502136, "learning_rate": 2e-05, "loss": 0.686, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3469, "tokens_per_second_per_gpu": 17731.83, "total_tokens": 342688532 }, { "epoch": 0.216929232308077, "grad_norm": 0.9255704879760742, "learning_rate": 2e-05, "loss": 0.7154, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3470, "tokens_per_second_per_gpu": 18809.55, "total_tokens": 342790754 }, { "epoch": 0.21699174793698425, "grad_norm": 0.913276731967926, "learning_rate": 2e-05, "loss": 0.7129, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3471, "tokens_per_second_per_gpu": 17048.36, "total_tokens": 342887234 }, { "epoch": 0.21705426356589147, "grad_norm": 0.9145263433456421, "learning_rate": 2e-05, "loss": 0.7558, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3472, "tokens_per_second_per_gpu": 18134.69, "total_tokens": 342988087 }, { "epoch": 0.2171167791947987, "grad_norm": 0.9093050956726074, "learning_rate": 2e-05, "loss": 0.6495, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3473, "tokens_per_second_per_gpu": 16324.07, "total_tokens": 343079965 }, { "epoch": 0.21717929482370593, "grad_norm": 0.9196884036064148, "learning_rate": 2e-05, "loss": 0.6863, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3474, "tokens_per_second_per_gpu": 16736.58, "total_tokens": 343174022 }, { "epoch": 0.21724181045261315, "grad_norm": 0.9273703694343567, "learning_rate": 2e-05, "loss": 0.7008, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3475, "tokens_per_second_per_gpu": 17518.12, "total_tokens": 343272498 }, { "epoch": 0.2173043260815204, "grad_norm": 0.8835545778274536, "learning_rate": 2e-05, "loss": 0.6953, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3476, "tokens_per_second_per_gpu": 18059.62, "total_tokens": 343371839 }, { "epoch": 0.2173668417104276, "grad_norm": 0.943320631980896, "learning_rate": 2e-05, "loss": 0.7078, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3477, "tokens_per_second_per_gpu": 16973.36, "total_tokens": 343468081 }, { "epoch": 0.21742935733933483, "grad_norm": 0.9136156439781189, "learning_rate": 2e-05, "loss": 0.7005, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3478, "tokens_per_second_per_gpu": 17489.78, "total_tokens": 343568289 }, { "epoch": 0.21749187296824207, "grad_norm": 0.8799566030502319, "learning_rate": 2e-05, "loss": 0.7093, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3479, "tokens_per_second_per_gpu": 17863.62, "total_tokens": 343668826 }, { "epoch": 0.2175543885971493, "grad_norm": 0.9149079322814941, "learning_rate": 2e-05, "loss": 0.7155, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3480, "tokens_per_second_per_gpu": 18108.1, "total_tokens": 343767199 }, { "epoch": 0.2176169042260565, "grad_norm": 0.9752122759819031, "learning_rate": 2e-05, "loss": 0.7682, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3481, "tokens_per_second_per_gpu": 16690.32, "total_tokens": 343862428 }, { "epoch": 0.21767941985496375, "grad_norm": 0.9387710094451904, "learning_rate": 2e-05, "loss": 0.6666, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3482, "tokens_per_second_per_gpu": 15291.45, "total_tokens": 343956494 }, { "epoch": 0.21774193548387097, "grad_norm": 0.8953869342803955, "learning_rate": 2e-05, "loss": 0.6721, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3483, "tokens_per_second_per_gpu": 16542.6, "total_tokens": 344052183 }, { "epoch": 0.21780445111277819, "grad_norm": 0.8513835668563843, "learning_rate": 2e-05, "loss": 0.6878, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3484, "tokens_per_second_per_gpu": 18330.67, "total_tokens": 344152928 }, { "epoch": 0.21786696674168543, "grad_norm": 0.8746731877326965, "learning_rate": 2e-05, "loss": 0.683, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3485, "tokens_per_second_per_gpu": 17625.43, "total_tokens": 344253433 }, { "epoch": 0.21792948237059265, "grad_norm": 0.8995851874351501, "learning_rate": 2e-05, "loss": 0.7101, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3486, "tokens_per_second_per_gpu": 17223.94, "total_tokens": 344352844 }, { "epoch": 0.21799199799949986, "grad_norm": 0.897289514541626, "learning_rate": 2e-05, "loss": 0.7241, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3487, "tokens_per_second_per_gpu": 18006.4, "total_tokens": 344452814 }, { "epoch": 0.2180545136284071, "grad_norm": 0.8826555609703064, "learning_rate": 2e-05, "loss": 0.6675, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3488, "tokens_per_second_per_gpu": 17332.93, "total_tokens": 344551844 }, { "epoch": 0.21811702925731433, "grad_norm": 0.9654474854469299, "learning_rate": 2e-05, "loss": 0.7536, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3489, "tokens_per_second_per_gpu": 16873.9, "total_tokens": 344647184 }, { "epoch": 0.21817954488622154, "grad_norm": 0.8577169179916382, "learning_rate": 2e-05, "loss": 0.7303, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3490, "tokens_per_second_per_gpu": 18820.43, "total_tokens": 344753698 }, { "epoch": 0.2182420605151288, "grad_norm": 0.9104345440864563, "learning_rate": 2e-05, "loss": 0.7239, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3491, "tokens_per_second_per_gpu": 18486.56, "total_tokens": 344857622 }, { "epoch": 0.218304576144036, "grad_norm": 0.9041884541511536, "learning_rate": 2e-05, "loss": 0.6427, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3492, "tokens_per_second_per_gpu": 15195.58, "total_tokens": 344950969 }, { "epoch": 0.21836709177294322, "grad_norm": 0.915936291217804, "learning_rate": 2e-05, "loss": 0.683, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3493, "tokens_per_second_per_gpu": 17029.37, "total_tokens": 345047631 }, { "epoch": 0.21842960740185047, "grad_norm": 0.9262682795524597, "learning_rate": 2e-05, "loss": 0.7474, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3494, "tokens_per_second_per_gpu": 17933.99, "total_tokens": 345147127 }, { "epoch": 0.21849212303075768, "grad_norm": 0.9174570441246033, "learning_rate": 2e-05, "loss": 0.6805, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3495, "tokens_per_second_per_gpu": 18164.22, "total_tokens": 345247144 }, { "epoch": 0.21855463865966493, "grad_norm": 0.8981844782829285, "learning_rate": 2e-05, "loss": 0.6981, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3496, "tokens_per_second_per_gpu": 17778.25, "total_tokens": 345347155 }, { "epoch": 0.21861715428857215, "grad_norm": 0.8725413680076599, "learning_rate": 2e-05, "loss": 0.7104, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3497, "tokens_per_second_per_gpu": 18778.01, "total_tokens": 345448542 }, { "epoch": 0.21867966991747936, "grad_norm": 0.9283565878868103, "learning_rate": 2e-05, "loss": 0.6962, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3498, "tokens_per_second_per_gpu": 17259.54, "total_tokens": 345542601 }, { "epoch": 0.2187421855463866, "grad_norm": 1.0018820762634277, "learning_rate": 2e-05, "loss": 0.6844, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3499, "tokens_per_second_per_gpu": 16050.0, "total_tokens": 345638287 }, { "epoch": 0.21880470117529383, "grad_norm": 0.8475329279899597, "learning_rate": 2e-05, "loss": 0.6755, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3500, "tokens_per_second_per_gpu": 17421.91, "total_tokens": 345739205 }, { "epoch": 0.21886721680420104, "grad_norm": 0.8983535766601562, "learning_rate": 2e-05, "loss": 0.7129, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3501, "tokens_per_second_per_gpu": 18480.63, "total_tokens": 345842631 }, { "epoch": 0.2189297324331083, "grad_norm": 0.912635087966919, "learning_rate": 2e-05, "loss": 0.6918, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3502, "tokens_per_second_per_gpu": 16940.61, "total_tokens": 345942142 }, { "epoch": 0.2189922480620155, "grad_norm": 0.8953595757484436, "learning_rate": 2e-05, "loss": 0.7461, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3503, "tokens_per_second_per_gpu": 18906.16, "total_tokens": 346042832 }, { "epoch": 0.21905476369092272, "grad_norm": 0.8977735638618469, "learning_rate": 2e-05, "loss": 0.6545, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3504, "tokens_per_second_per_gpu": 17767.09, "total_tokens": 346141499 }, { "epoch": 0.21911727931982997, "grad_norm": 0.9347326159477234, "learning_rate": 2e-05, "loss": 0.7195, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3505, "tokens_per_second_per_gpu": 16946.89, "total_tokens": 346239272 }, { "epoch": 0.21917979494873718, "grad_norm": 0.9182495474815369, "learning_rate": 2e-05, "loss": 0.7484, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3506, "tokens_per_second_per_gpu": 17160.69, "total_tokens": 346339086 }, { "epoch": 0.2192423105776444, "grad_norm": 0.8725362420082092, "learning_rate": 2e-05, "loss": 0.6521, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3507, "tokens_per_second_per_gpu": 16851.65, "total_tokens": 346436652 }, { "epoch": 0.21930482620655165, "grad_norm": 0.9150961637496948, "learning_rate": 2e-05, "loss": 0.7512, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3508, "tokens_per_second_per_gpu": 18375.19, "total_tokens": 346539254 }, { "epoch": 0.21936734183545886, "grad_norm": 0.8939502835273743, "learning_rate": 2e-05, "loss": 0.7104, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3509, "tokens_per_second_per_gpu": 18092.44, "total_tokens": 346637891 }, { "epoch": 0.21942985746436608, "grad_norm": 0.878404974937439, "learning_rate": 2e-05, "loss": 0.6978, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3510, "tokens_per_second_per_gpu": 17405.86, "total_tokens": 346739456 }, { "epoch": 0.21949237309327332, "grad_norm": 0.8918633460998535, "learning_rate": 2e-05, "loss": 0.6979, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3511, "tokens_per_second_per_gpu": 17115.87, "total_tokens": 346838960 }, { "epoch": 0.21955488872218054, "grad_norm": 0.882152259349823, "learning_rate": 2e-05, "loss": 0.7158, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3512, "tokens_per_second_per_gpu": 17160.49, "total_tokens": 346938495 }, { "epoch": 0.21961740435108776, "grad_norm": 0.9132250547409058, "learning_rate": 2e-05, "loss": 0.7496, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3513, "tokens_per_second_per_gpu": 18434.53, "total_tokens": 347044007 }, { "epoch": 0.219679919979995, "grad_norm": 0.9112141728401184, "learning_rate": 2e-05, "loss": 0.728, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3514, "tokens_per_second_per_gpu": 18094.85, "total_tokens": 347145900 }, { "epoch": 0.21974243560890222, "grad_norm": 0.9126465916633606, "learning_rate": 2e-05, "loss": 0.7152, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3515, "tokens_per_second_per_gpu": 17336.31, "total_tokens": 347244827 }, { "epoch": 0.21980495123780946, "grad_norm": 0.9468284249305725, "learning_rate": 2e-05, "loss": 0.7111, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3516, "tokens_per_second_per_gpu": 17636.68, "total_tokens": 347343212 }, { "epoch": 0.21986746686671668, "grad_norm": 0.9396647810935974, "learning_rate": 2e-05, "loss": 0.7226, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3517, "tokens_per_second_per_gpu": 16261.98, "total_tokens": 347436703 }, { "epoch": 0.2199299824956239, "grad_norm": 0.8933379650115967, "learning_rate": 2e-05, "loss": 0.7032, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3518, "tokens_per_second_per_gpu": 17873.52, "total_tokens": 347535271 }, { "epoch": 0.21999249812453114, "grad_norm": 0.9036476612091064, "learning_rate": 2e-05, "loss": 0.6894, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3519, "tokens_per_second_per_gpu": 16721.81, "total_tokens": 347630811 }, { "epoch": 0.22005501375343836, "grad_norm": 0.8885653018951416, "learning_rate": 2e-05, "loss": 0.7113, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3520, "tokens_per_second_per_gpu": 17736.33, "total_tokens": 347734242 }, { "epoch": 0.22011752938234558, "grad_norm": 0.9272810816764832, "learning_rate": 2e-05, "loss": 0.7017, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3521, "tokens_per_second_per_gpu": 18525.07, "total_tokens": 347833070 }, { "epoch": 0.22018004501125282, "grad_norm": 0.9116895198822021, "learning_rate": 2e-05, "loss": 0.685, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3522, "tokens_per_second_per_gpu": 16624.02, "total_tokens": 347927684 }, { "epoch": 0.22024256064016004, "grad_norm": 0.9193771481513977, "learning_rate": 2e-05, "loss": 0.6585, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3523, "tokens_per_second_per_gpu": 15063.1, "total_tokens": 348018993 }, { "epoch": 0.22030507626906726, "grad_norm": 0.8995683789253235, "learning_rate": 2e-05, "loss": 0.719, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3524, "tokens_per_second_per_gpu": 17723.62, "total_tokens": 348119645 }, { "epoch": 0.2203675918979745, "grad_norm": 0.9038082957267761, "learning_rate": 2e-05, "loss": 0.6894, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3525, "tokens_per_second_per_gpu": 15797.12, "total_tokens": 348219562 }, { "epoch": 0.22043010752688172, "grad_norm": 0.8971043229103088, "learning_rate": 2e-05, "loss": 0.6628, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3526, "tokens_per_second_per_gpu": 16647.0, "total_tokens": 348315880 }, { "epoch": 0.22049262315578894, "grad_norm": 0.9360092878341675, "learning_rate": 2e-05, "loss": 0.7216, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3527, "tokens_per_second_per_gpu": 17862.56, "total_tokens": 348420830 }, { "epoch": 0.22055513878469618, "grad_norm": 0.9040787220001221, "learning_rate": 2e-05, "loss": 0.7236, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3528, "tokens_per_second_per_gpu": 16460.61, "total_tokens": 348515955 }, { "epoch": 0.2206176544136034, "grad_norm": 0.9116660356521606, "learning_rate": 2e-05, "loss": 0.741, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3529, "tokens_per_second_per_gpu": 17636.07, "total_tokens": 348614996 }, { "epoch": 0.22068017004251061, "grad_norm": 0.9313174486160278, "learning_rate": 2e-05, "loss": 0.6967, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3530, "tokens_per_second_per_gpu": 17120.51, "total_tokens": 348712247 }, { "epoch": 0.22074268567141786, "grad_norm": 0.9095075726509094, "learning_rate": 2e-05, "loss": 0.7285, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3531, "tokens_per_second_per_gpu": 17988.01, "total_tokens": 348815012 }, { "epoch": 0.22080520130032508, "grad_norm": 0.9129692316055298, "learning_rate": 2e-05, "loss": 0.6532, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3532, "tokens_per_second_per_gpu": 16346.59, "total_tokens": 348908432 }, { "epoch": 0.22086771692923232, "grad_norm": 0.9138655066490173, "learning_rate": 2e-05, "loss": 0.7128, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3533, "tokens_per_second_per_gpu": 17702.49, "total_tokens": 349006562 }, { "epoch": 0.22093023255813954, "grad_norm": 0.9066198468208313, "learning_rate": 2e-05, "loss": 0.7118, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3534, "tokens_per_second_per_gpu": 18151.47, "total_tokens": 349106411 }, { "epoch": 0.22099274818704676, "grad_norm": 0.9010026454925537, "learning_rate": 2e-05, "loss": 0.6801, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3535, "tokens_per_second_per_gpu": 15752.78, "total_tokens": 349202632 }, { "epoch": 0.221055263815954, "grad_norm": 0.9758462905883789, "learning_rate": 2e-05, "loss": 0.7051, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3536, "tokens_per_second_per_gpu": 17734.03, "total_tokens": 349302880 }, { "epoch": 0.22111777944486122, "grad_norm": 0.9492954611778259, "learning_rate": 2e-05, "loss": 0.6962, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3537, "tokens_per_second_per_gpu": 17955.14, "total_tokens": 349402049 }, { "epoch": 0.22118029507376843, "grad_norm": 0.8947362303733826, "learning_rate": 2e-05, "loss": 0.6742, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3538, "tokens_per_second_per_gpu": 17536.84, "total_tokens": 349502708 }, { "epoch": 0.22124281070267568, "grad_norm": 0.9329335689544678, "learning_rate": 2e-05, "loss": 0.7199, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3539, "tokens_per_second_per_gpu": 17320.27, "total_tokens": 349602073 }, { "epoch": 0.2213053263315829, "grad_norm": 0.959406852722168, "learning_rate": 2e-05, "loss": 0.684, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3540, "tokens_per_second_per_gpu": 16710.6, "total_tokens": 349699273 }, { "epoch": 0.2213678419604901, "grad_norm": 0.8579906821250916, "learning_rate": 2e-05, "loss": 0.6905, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3541, "tokens_per_second_per_gpu": 17763.09, "total_tokens": 349798654 }, { "epoch": 0.22143035758939736, "grad_norm": 0.9180085062980652, "learning_rate": 2e-05, "loss": 0.7402, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3542, "tokens_per_second_per_gpu": 17007.83, "total_tokens": 349895234 }, { "epoch": 0.22149287321830458, "grad_norm": 1.0023205280303955, "learning_rate": 2e-05, "loss": 0.7744, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3543, "tokens_per_second_per_gpu": 15238.0, "total_tokens": 349987162 }, { "epoch": 0.2215553888472118, "grad_norm": 1.0407902002334595, "learning_rate": 2e-05, "loss": 0.677, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3544, "tokens_per_second_per_gpu": 17337.43, "total_tokens": 350086535 }, { "epoch": 0.22161790447611904, "grad_norm": 0.8640415668487549, "learning_rate": 2e-05, "loss": 0.6513, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3545, "tokens_per_second_per_gpu": 16676.34, "total_tokens": 350182570 }, { "epoch": 0.22168042010502625, "grad_norm": 0.8992143273353577, "learning_rate": 2e-05, "loss": 0.7154, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3546, "tokens_per_second_per_gpu": 17848.23, "total_tokens": 350284363 }, { "epoch": 0.22174293573393347, "grad_norm": 0.9448902010917664, "learning_rate": 2e-05, "loss": 0.7265, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3547, "tokens_per_second_per_gpu": 18346.69, "total_tokens": 350385170 }, { "epoch": 0.22180545136284072, "grad_norm": 0.9148391485214233, "learning_rate": 2e-05, "loss": 0.7543, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3548, "tokens_per_second_per_gpu": 16913.32, "total_tokens": 350483017 }, { "epoch": 0.22186796699174793, "grad_norm": 0.9071739912033081, "learning_rate": 2e-05, "loss": 0.6238, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3549, "tokens_per_second_per_gpu": 15554.23, "total_tokens": 350573786 }, { "epoch": 0.22193048262065515, "grad_norm": 0.9396986961364746, "learning_rate": 2e-05, "loss": 0.7021, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3550, "tokens_per_second_per_gpu": 16497.77, "total_tokens": 350670437 }, { "epoch": 0.2219929982495624, "grad_norm": 0.886664092540741, "learning_rate": 2e-05, "loss": 0.7229, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3551, "tokens_per_second_per_gpu": 17968.69, "total_tokens": 350771966 }, { "epoch": 0.2220555138784696, "grad_norm": 0.8865143060684204, "learning_rate": 2e-05, "loss": 0.6901, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3552, "tokens_per_second_per_gpu": 17657.21, "total_tokens": 350872023 }, { "epoch": 0.22211802950737686, "grad_norm": 0.9119451642036438, "learning_rate": 2e-05, "loss": 0.7219, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3553, "tokens_per_second_per_gpu": 16923.24, "total_tokens": 350973519 }, { "epoch": 0.22218054513628407, "grad_norm": 0.9714253544807434, "learning_rate": 2e-05, "loss": 0.7425, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3554, "tokens_per_second_per_gpu": 18494.26, "total_tokens": 351076400 }, { "epoch": 0.2222430607651913, "grad_norm": 0.8718903660774231, "learning_rate": 2e-05, "loss": 0.6819, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3555, "tokens_per_second_per_gpu": 17715.51, "total_tokens": 351176547 }, { "epoch": 0.22230557639409854, "grad_norm": 0.9222926497459412, "learning_rate": 2e-05, "loss": 0.687, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3556, "tokens_per_second_per_gpu": 17961.34, "total_tokens": 351277405 }, { "epoch": 0.22236809202300575, "grad_norm": 0.9626809358596802, "learning_rate": 2e-05, "loss": 0.6867, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3557, "tokens_per_second_per_gpu": 18071.3, "total_tokens": 351374754 }, { "epoch": 0.22243060765191297, "grad_norm": 0.9711483120918274, "learning_rate": 2e-05, "loss": 0.7146, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3558, "tokens_per_second_per_gpu": 15605.49, "total_tokens": 351470388 }, { "epoch": 0.22249312328082022, "grad_norm": 0.9812208414077759, "learning_rate": 2e-05, "loss": 0.6838, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3559, "tokens_per_second_per_gpu": 18160.95, "total_tokens": 351570995 }, { "epoch": 0.22255563890972743, "grad_norm": 0.9026910066604614, "learning_rate": 2e-05, "loss": 0.6881, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3560, "tokens_per_second_per_gpu": 18113.3, "total_tokens": 351673625 }, { "epoch": 0.22261815453863465, "grad_norm": 0.9952516555786133, "learning_rate": 2e-05, "loss": 0.7156, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3561, "tokens_per_second_per_gpu": 17842.44, "total_tokens": 351774154 }, { "epoch": 0.2226806701675419, "grad_norm": 0.9304971098899841, "learning_rate": 2e-05, "loss": 0.7119, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3562, "tokens_per_second_per_gpu": 17579.79, "total_tokens": 351874463 }, { "epoch": 0.2227431857964491, "grad_norm": 0.9254277944564819, "learning_rate": 2e-05, "loss": 0.6878, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3563, "tokens_per_second_per_gpu": 17405.84, "total_tokens": 351971444 }, { "epoch": 0.22280570142535633, "grad_norm": 0.8899314403533936, "learning_rate": 2e-05, "loss": 0.6896, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3564, "tokens_per_second_per_gpu": 18047.6, "total_tokens": 352075440 }, { "epoch": 0.22286821705426357, "grad_norm": 0.9958744645118713, "learning_rate": 2e-05, "loss": 0.7353, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3565, "tokens_per_second_per_gpu": 17159.04, "total_tokens": 352172662 }, { "epoch": 0.2229307326831708, "grad_norm": 0.911453127861023, "learning_rate": 2e-05, "loss": 0.755, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3566, "tokens_per_second_per_gpu": 18353.22, "total_tokens": 352273720 }, { "epoch": 0.222993248312078, "grad_norm": 0.8846615552902222, "learning_rate": 2e-05, "loss": 0.6713, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3567, "tokens_per_second_per_gpu": 18341.94, "total_tokens": 352371187 }, { "epoch": 0.22305576394098525, "grad_norm": 0.8932715654373169, "learning_rate": 2e-05, "loss": 0.6808, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3568, "tokens_per_second_per_gpu": 17811.49, "total_tokens": 352471624 }, { "epoch": 0.22311827956989247, "grad_norm": 0.8652172684669495, "learning_rate": 2e-05, "loss": 0.6852, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3569, "tokens_per_second_per_gpu": 17006.64, "total_tokens": 352571064 }, { "epoch": 0.2231807951987997, "grad_norm": 0.8797615766525269, "learning_rate": 2e-05, "loss": 0.7046, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3570, "tokens_per_second_per_gpu": 18746.09, "total_tokens": 352677044 }, { "epoch": 0.22324331082770693, "grad_norm": 0.8948146104812622, "learning_rate": 2e-05, "loss": 0.7001, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3571, "tokens_per_second_per_gpu": 17992.61, "total_tokens": 352775262 }, { "epoch": 0.22330582645661415, "grad_norm": 0.9457796812057495, "learning_rate": 2e-05, "loss": 0.7284, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3572, "tokens_per_second_per_gpu": 16533.71, "total_tokens": 352869832 }, { "epoch": 0.2233683420855214, "grad_norm": 0.9040939211845398, "learning_rate": 2e-05, "loss": 0.6997, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3573, "tokens_per_second_per_gpu": 18094.81, "total_tokens": 352966519 }, { "epoch": 0.2234308577144286, "grad_norm": 0.9327536821365356, "learning_rate": 2e-05, "loss": 0.7266, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3574, "tokens_per_second_per_gpu": 17848.61, "total_tokens": 353066266 }, { "epoch": 0.22349337334333583, "grad_norm": 0.8930810689926147, "learning_rate": 2e-05, "loss": 0.668, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3575, "tokens_per_second_per_gpu": 16729.71, "total_tokens": 353164799 }, { "epoch": 0.22355588897224307, "grad_norm": 0.9296519756317139, "learning_rate": 2e-05, "loss": 0.7103, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3576, "tokens_per_second_per_gpu": 17403.92, "total_tokens": 353264212 }, { "epoch": 0.2236184046011503, "grad_norm": 0.8812124133110046, "learning_rate": 2e-05, "loss": 0.7213, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3577, "tokens_per_second_per_gpu": 17273.17, "total_tokens": 353363417 }, { "epoch": 0.2236809202300575, "grad_norm": 0.9651620984077454, "learning_rate": 2e-05, "loss": 0.6808, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3578, "tokens_per_second_per_gpu": 15814.02, "total_tokens": 353456525 }, { "epoch": 0.22374343585896475, "grad_norm": 0.9264265298843384, "learning_rate": 2e-05, "loss": 0.6973, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3579, "tokens_per_second_per_gpu": 16631.79, "total_tokens": 353555561 }, { "epoch": 0.22380595148787197, "grad_norm": 0.8655561208724976, "learning_rate": 2e-05, "loss": 0.6619, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3580, "tokens_per_second_per_gpu": 17289.51, "total_tokens": 353656086 }, { "epoch": 0.22386846711677919, "grad_norm": 0.9058122634887695, "learning_rate": 2e-05, "loss": 0.6872, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3581, "tokens_per_second_per_gpu": 16533.73, "total_tokens": 353755509 }, { "epoch": 0.22393098274568643, "grad_norm": 0.8976870775222778, "learning_rate": 2e-05, "loss": 0.685, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3582, "tokens_per_second_per_gpu": 18120.46, "total_tokens": 353856416 }, { "epoch": 0.22399349837459365, "grad_norm": 0.8767185807228088, "learning_rate": 2e-05, "loss": 0.7065, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3583, "tokens_per_second_per_gpu": 18727.07, "total_tokens": 353957731 }, { "epoch": 0.22405601400350086, "grad_norm": 0.9066479206085205, "learning_rate": 2e-05, "loss": 0.7307, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3584, "tokens_per_second_per_gpu": 17839.79, "total_tokens": 354058931 }, { "epoch": 0.2241185296324081, "grad_norm": 0.9221702218055725, "learning_rate": 2e-05, "loss": 0.667, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3585, "tokens_per_second_per_gpu": 16899.61, "total_tokens": 354153599 }, { "epoch": 0.22418104526131533, "grad_norm": 0.8756466507911682, "learning_rate": 2e-05, "loss": 0.6837, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3586, "tokens_per_second_per_gpu": 18318.09, "total_tokens": 354254932 }, { "epoch": 0.22424356089022254, "grad_norm": 0.8867779970169067, "learning_rate": 2e-05, "loss": 0.7124, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3587, "tokens_per_second_per_gpu": 18429.68, "total_tokens": 354359211 }, { "epoch": 0.2243060765191298, "grad_norm": 0.8956486582756042, "learning_rate": 2e-05, "loss": 0.6685, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3588, "tokens_per_second_per_gpu": 17762.9, "total_tokens": 354459601 }, { "epoch": 0.224368592148037, "grad_norm": 0.883808970451355, "learning_rate": 2e-05, "loss": 0.6883, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3589, "tokens_per_second_per_gpu": 17719.59, "total_tokens": 354559397 }, { "epoch": 0.22443110777694425, "grad_norm": 0.8869337439537048, "learning_rate": 2e-05, "loss": 0.6756, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3590, "tokens_per_second_per_gpu": 17813.14, "total_tokens": 354658763 }, { "epoch": 0.22449362340585147, "grad_norm": 0.9285633563995361, "learning_rate": 2e-05, "loss": 0.7155, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3591, "tokens_per_second_per_gpu": 16096.75, "total_tokens": 354754730 }, { "epoch": 0.22455613903475868, "grad_norm": 0.8801113963127136, "learning_rate": 2e-05, "loss": 0.7192, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3592, "tokens_per_second_per_gpu": 17447.85, "total_tokens": 354853021 }, { "epoch": 0.22461865466366593, "grad_norm": 0.9165838956832886, "learning_rate": 2e-05, "loss": 0.7322, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3593, "tokens_per_second_per_gpu": 18033.21, "total_tokens": 354953726 }, { "epoch": 0.22468117029257315, "grad_norm": 0.9090580940246582, "learning_rate": 2e-05, "loss": 0.7181, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3594, "tokens_per_second_per_gpu": 17622.3, "total_tokens": 355050515 }, { "epoch": 0.22474368592148036, "grad_norm": 0.8931262493133545, "learning_rate": 2e-05, "loss": 0.6841, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3595, "tokens_per_second_per_gpu": 17944.51, "total_tokens": 355150475 }, { "epoch": 0.2248062015503876, "grad_norm": 0.890306830406189, "learning_rate": 2e-05, "loss": 0.7146, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3596, "tokens_per_second_per_gpu": 17935.91, "total_tokens": 355250034 }, { "epoch": 0.22486871717929482, "grad_norm": 0.8878995180130005, "learning_rate": 2e-05, "loss": 0.7246, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3597, "tokens_per_second_per_gpu": 17752.47, "total_tokens": 355350961 }, { "epoch": 0.22493123280820204, "grad_norm": 0.8987960815429688, "learning_rate": 2e-05, "loss": 0.6996, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3598, "tokens_per_second_per_gpu": 17087.46, "total_tokens": 355449947 }, { "epoch": 0.2249937484371093, "grad_norm": 0.8982648849487305, "learning_rate": 2e-05, "loss": 0.6954, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3599, "tokens_per_second_per_gpu": 16305.66, "total_tokens": 355548752 }, { "epoch": 0.2250562640660165, "grad_norm": 0.8929717540740967, "learning_rate": 2e-05, "loss": 0.7166, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3600, "tokens_per_second_per_gpu": 17557.89, "total_tokens": 355650353 }, { "epoch": 0.22511877969492372, "grad_norm": 0.8678832054138184, "learning_rate": 2e-05, "loss": 0.6976, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3601, "tokens_per_second_per_gpu": 17717.77, "total_tokens": 355751041 }, { "epoch": 0.22518129532383097, "grad_norm": 0.8988049626350403, "learning_rate": 2e-05, "loss": 0.6358, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3602, "tokens_per_second_per_gpu": 18124.14, "total_tokens": 355849362 }, { "epoch": 0.22524381095273818, "grad_norm": 0.8932735919952393, "learning_rate": 2e-05, "loss": 0.7275, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3603, "tokens_per_second_per_gpu": 17996.79, "total_tokens": 355952021 }, { "epoch": 0.2253063265816454, "grad_norm": 0.9466664791107178, "learning_rate": 2e-05, "loss": 0.7544, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3604, "tokens_per_second_per_gpu": 17605.98, "total_tokens": 356053407 }, { "epoch": 0.22536884221055264, "grad_norm": 0.9150352478027344, "learning_rate": 2e-05, "loss": 0.6985, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3605, "tokens_per_second_per_gpu": 16702.08, "total_tokens": 356149729 }, { "epoch": 0.22543135783945986, "grad_norm": 0.8912304639816284, "learning_rate": 2e-05, "loss": 0.6533, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3606, "tokens_per_second_per_gpu": 18089.24, "total_tokens": 356247788 }, { "epoch": 0.22549387346836708, "grad_norm": 0.9282631278038025, "learning_rate": 2e-05, "loss": 0.7447, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3607, "tokens_per_second_per_gpu": 17441.09, "total_tokens": 356346085 }, { "epoch": 0.22555638909727432, "grad_norm": 0.8822139501571655, "learning_rate": 2e-05, "loss": 0.7332, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3608, "tokens_per_second_per_gpu": 18495.36, "total_tokens": 356447965 }, { "epoch": 0.22561890472618154, "grad_norm": 0.931221067905426, "learning_rate": 2e-05, "loss": 0.7027, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3609, "tokens_per_second_per_gpu": 17350.27, "total_tokens": 356543767 }, { "epoch": 0.22568142035508879, "grad_norm": 0.9204666614532471, "learning_rate": 2e-05, "loss": 0.7237, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3610, "tokens_per_second_per_gpu": 18050.78, "total_tokens": 356647358 }, { "epoch": 0.225743935983996, "grad_norm": 0.9099909663200378, "learning_rate": 2e-05, "loss": 0.6429, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3611, "tokens_per_second_per_gpu": 16730.47, "total_tokens": 356742101 }, { "epoch": 0.22580645161290322, "grad_norm": 0.9005494117736816, "learning_rate": 2e-05, "loss": 0.6818, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3612, "tokens_per_second_per_gpu": 16552.77, "total_tokens": 356837953 }, { "epoch": 0.22586896724181046, "grad_norm": 0.9396748542785645, "learning_rate": 2e-05, "loss": 0.6991, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3613, "tokens_per_second_per_gpu": 18323.15, "total_tokens": 356939079 }, { "epoch": 0.22593148287071768, "grad_norm": 0.9227089881896973, "learning_rate": 2e-05, "loss": 0.6756, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3614, "tokens_per_second_per_gpu": 17204.7, "total_tokens": 357035684 }, { "epoch": 0.2259939984996249, "grad_norm": 0.9052592515945435, "learning_rate": 2e-05, "loss": 0.6847, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3615, "tokens_per_second_per_gpu": 16491.3, "total_tokens": 357132130 }, { "epoch": 0.22605651412853214, "grad_norm": 0.9340832233428955, "learning_rate": 2e-05, "loss": 0.6884, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3616, "tokens_per_second_per_gpu": 17144.82, "total_tokens": 357227321 }, { "epoch": 0.22611902975743936, "grad_norm": 0.8866902589797974, "learning_rate": 2e-05, "loss": 0.7065, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3617, "tokens_per_second_per_gpu": 18090.11, "total_tokens": 357327845 }, { "epoch": 0.22618154538634658, "grad_norm": 0.9877777695655823, "learning_rate": 2e-05, "loss": 0.6738, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3618, "tokens_per_second_per_gpu": 17454.03, "total_tokens": 357425570 }, { "epoch": 0.22624406101525382, "grad_norm": 0.8814852237701416, "learning_rate": 2e-05, "loss": 0.6892, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3619, "tokens_per_second_per_gpu": 17880.57, "total_tokens": 357523686 }, { "epoch": 0.22630657664416104, "grad_norm": 0.9261888265609741, "learning_rate": 2e-05, "loss": 0.6889, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3620, "tokens_per_second_per_gpu": 16839.88, "total_tokens": 357617137 }, { "epoch": 0.22636909227306826, "grad_norm": 0.9375717639923096, "learning_rate": 2e-05, "loss": 0.7217, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3621, "tokens_per_second_per_gpu": 16662.96, "total_tokens": 357711864 }, { "epoch": 0.2264316079019755, "grad_norm": 0.9971256852149963, "learning_rate": 2e-05, "loss": 0.6333, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3622, "tokens_per_second_per_gpu": 14899.4, "total_tokens": 357804075 }, { "epoch": 0.22649412353088272, "grad_norm": 0.959862470626831, "learning_rate": 2e-05, "loss": 0.6665, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3623, "tokens_per_second_per_gpu": 16868.57, "total_tokens": 357899702 }, { "epoch": 0.22655663915978994, "grad_norm": 0.8716877102851868, "learning_rate": 2e-05, "loss": 0.6815, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3624, "tokens_per_second_per_gpu": 17291.1, "total_tokens": 357996315 }, { "epoch": 0.22661915478869718, "grad_norm": 0.9430704116821289, "learning_rate": 2e-05, "loss": 0.6604, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3625, "tokens_per_second_per_gpu": 16919.24, "total_tokens": 358093131 }, { "epoch": 0.2266816704176044, "grad_norm": 0.9039745926856995, "learning_rate": 2e-05, "loss": 0.6661, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3626, "tokens_per_second_per_gpu": 17031.38, "total_tokens": 358192061 }, { "epoch": 0.22674418604651161, "grad_norm": 0.9427763223648071, "learning_rate": 2e-05, "loss": 0.7364, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3627, "tokens_per_second_per_gpu": 16762.61, "total_tokens": 358291754 }, { "epoch": 0.22680670167541886, "grad_norm": 0.9311524629592896, "learning_rate": 2e-05, "loss": 0.7224, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3628, "tokens_per_second_per_gpu": 17866.05, "total_tokens": 358392440 }, { "epoch": 0.22686921730432608, "grad_norm": 0.9867621064186096, "learning_rate": 2e-05, "loss": 0.659, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3629, "tokens_per_second_per_gpu": 16489.3, "total_tokens": 358487467 }, { "epoch": 0.22693173293323332, "grad_norm": 0.911679208278656, "learning_rate": 2e-05, "loss": 0.6686, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3630, "tokens_per_second_per_gpu": 17646.3, "total_tokens": 358583922 }, { "epoch": 0.22699424856214054, "grad_norm": 0.9368058443069458, "learning_rate": 2e-05, "loss": 0.7306, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3631, "tokens_per_second_per_gpu": 17549.59, "total_tokens": 358684361 }, { "epoch": 0.22705676419104776, "grad_norm": 0.952573299407959, "learning_rate": 2e-05, "loss": 0.6964, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3632, "tokens_per_second_per_gpu": 17996.39, "total_tokens": 358781987 }, { "epoch": 0.227119279819955, "grad_norm": 0.9018346667289734, "learning_rate": 2e-05, "loss": 0.7094, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3633, "tokens_per_second_per_gpu": 19274.89, "total_tokens": 358885935 }, { "epoch": 0.22718179544886222, "grad_norm": 0.8949706554412842, "learning_rate": 2e-05, "loss": 0.7109, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3634, "tokens_per_second_per_gpu": 17748.93, "total_tokens": 358987373 }, { "epoch": 0.22724431107776943, "grad_norm": 0.9319155812263489, "learning_rate": 2e-05, "loss": 0.6797, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3635, "tokens_per_second_per_gpu": 17754.56, "total_tokens": 359087853 }, { "epoch": 0.22730682670667668, "grad_norm": 0.9491362571716309, "learning_rate": 2e-05, "loss": 0.7031, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3636, "tokens_per_second_per_gpu": 16738.75, "total_tokens": 359185794 }, { "epoch": 0.2273693423355839, "grad_norm": 0.9104936122894287, "learning_rate": 2e-05, "loss": 0.7321, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3637, "tokens_per_second_per_gpu": 16756.55, "total_tokens": 359286306 }, { "epoch": 0.2274318579644911, "grad_norm": 0.8913196921348572, "learning_rate": 2e-05, "loss": 0.6744, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3638, "tokens_per_second_per_gpu": 17359.46, "total_tokens": 359385767 }, { "epoch": 0.22749437359339836, "grad_norm": 0.9661215543746948, "learning_rate": 2e-05, "loss": 0.6711, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3639, "tokens_per_second_per_gpu": 17096.83, "total_tokens": 359483089 }, { "epoch": 0.22755688922230558, "grad_norm": 0.9578418135643005, "learning_rate": 2e-05, "loss": 0.6647, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3640, "tokens_per_second_per_gpu": 17032.12, "total_tokens": 359580479 }, { "epoch": 0.2276194048512128, "grad_norm": 0.9661098718643188, "learning_rate": 2e-05, "loss": 0.7145, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3641, "tokens_per_second_per_gpu": 17686.08, "total_tokens": 359680338 }, { "epoch": 0.22768192048012004, "grad_norm": 0.9712583422660828, "learning_rate": 2e-05, "loss": 0.7207, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3642, "tokens_per_second_per_gpu": 18022.17, "total_tokens": 359781706 }, { "epoch": 0.22774443610902725, "grad_norm": 0.9255023002624512, "learning_rate": 2e-05, "loss": 0.6726, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3643, "tokens_per_second_per_gpu": 19223.81, "total_tokens": 359883569 }, { "epoch": 0.22780695173793447, "grad_norm": 0.9425341486930847, "learning_rate": 2e-05, "loss": 0.6659, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3644, "tokens_per_second_per_gpu": 16329.8, "total_tokens": 359977692 }, { "epoch": 0.22786946736684172, "grad_norm": 0.9004485607147217, "learning_rate": 2e-05, "loss": 0.6617, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3645, "tokens_per_second_per_gpu": 16556.5, "total_tokens": 360071673 }, { "epoch": 0.22793198299574893, "grad_norm": 0.8593522310256958, "learning_rate": 2e-05, "loss": 0.6428, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3646, "tokens_per_second_per_gpu": 16884.64, "total_tokens": 360167913 }, { "epoch": 0.22799449862465618, "grad_norm": 0.9253976941108704, "learning_rate": 2e-05, "loss": 0.7028, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3647, "tokens_per_second_per_gpu": 17632.6, "total_tokens": 360263587 }, { "epoch": 0.2280570142535634, "grad_norm": 0.9950152635574341, "learning_rate": 2e-05, "loss": 0.6997, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3648, "tokens_per_second_per_gpu": 15942.6, "total_tokens": 360358564 }, { "epoch": 0.2281195298824706, "grad_norm": 0.9395313262939453, "learning_rate": 2e-05, "loss": 0.7464, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3649, "tokens_per_second_per_gpu": 18752.65, "total_tokens": 360460011 }, { "epoch": 0.22818204551137786, "grad_norm": 0.9102711081504822, "learning_rate": 2e-05, "loss": 0.675, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3650, "tokens_per_second_per_gpu": 17307.57, "total_tokens": 360556234 }, { "epoch": 0.22824456114028507, "grad_norm": 0.8656097650527954, "learning_rate": 2e-05, "loss": 0.7065, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3651, "tokens_per_second_per_gpu": 18153.17, "total_tokens": 360661098 }, { "epoch": 0.2283070767691923, "grad_norm": 0.9479295015335083, "learning_rate": 2e-05, "loss": 0.649, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3652, "tokens_per_second_per_gpu": 16020.06, "total_tokens": 360756606 }, { "epoch": 0.22836959239809954, "grad_norm": 0.9912386536598206, "learning_rate": 2e-05, "loss": 0.7458, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3653, "tokens_per_second_per_gpu": 17013.85, "total_tokens": 360850531 }, { "epoch": 0.22843210802700675, "grad_norm": 0.9049780964851379, "learning_rate": 2e-05, "loss": 0.7143, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3654, "tokens_per_second_per_gpu": 16886.98, "total_tokens": 360950682 }, { "epoch": 0.22849462365591397, "grad_norm": 0.8991239070892334, "learning_rate": 2e-05, "loss": 0.7433, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3655, "tokens_per_second_per_gpu": 17324.88, "total_tokens": 361053360 }, { "epoch": 0.22855713928482121, "grad_norm": 0.9147327542304993, "learning_rate": 2e-05, "loss": 0.7434, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3656, "tokens_per_second_per_gpu": 18528.38, "total_tokens": 361156508 }, { "epoch": 0.22861965491372843, "grad_norm": 0.9181051850318909, "learning_rate": 2e-05, "loss": 0.7118, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3657, "tokens_per_second_per_gpu": 19364.65, "total_tokens": 361262007 }, { "epoch": 0.22868217054263565, "grad_norm": 0.9247111678123474, "learning_rate": 2e-05, "loss": 0.7074, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3658, "tokens_per_second_per_gpu": 18248.43, "total_tokens": 361364423 }, { "epoch": 0.2287446861715429, "grad_norm": 0.9294692277908325, "learning_rate": 2e-05, "loss": 0.6898, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3659, "tokens_per_second_per_gpu": 17884.15, "total_tokens": 361462067 }, { "epoch": 0.2288072018004501, "grad_norm": 0.8933585286140442, "learning_rate": 2e-05, "loss": 0.6739, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3660, "tokens_per_second_per_gpu": 16860.91, "total_tokens": 361559909 }, { "epoch": 0.22886971742935733, "grad_norm": 0.9383262991905212, "learning_rate": 2e-05, "loss": 0.7105, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3661, "tokens_per_second_per_gpu": 17812.25, "total_tokens": 361662187 }, { "epoch": 0.22893223305826457, "grad_norm": 0.9049383997917175, "learning_rate": 2e-05, "loss": 0.7035, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3662, "tokens_per_second_per_gpu": 18257.33, "total_tokens": 361763464 }, { "epoch": 0.2289947486871718, "grad_norm": 0.8854354023933411, "learning_rate": 2e-05, "loss": 0.6572, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3663, "tokens_per_second_per_gpu": 16729.2, "total_tokens": 361856800 }, { "epoch": 0.229057264316079, "grad_norm": 0.8769302368164062, "learning_rate": 2e-05, "loss": 0.6451, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3664, "tokens_per_second_per_gpu": 18037.94, "total_tokens": 361957188 }, { "epoch": 0.22911977994498625, "grad_norm": 0.860315203666687, "learning_rate": 2e-05, "loss": 0.6632, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3665, "tokens_per_second_per_gpu": 17412.27, "total_tokens": 362056612 }, { "epoch": 0.22918229557389347, "grad_norm": 0.9057541489601135, "learning_rate": 2e-05, "loss": 0.7085, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3666, "tokens_per_second_per_gpu": 18224.85, "total_tokens": 362155030 }, { "epoch": 0.2292448112028007, "grad_norm": 0.9592877626419067, "learning_rate": 2e-05, "loss": 0.7272, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3667, "tokens_per_second_per_gpu": 18179.7, "total_tokens": 362257470 }, { "epoch": 0.22930732683170793, "grad_norm": 0.9472943544387817, "learning_rate": 2e-05, "loss": 0.7213, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3668, "tokens_per_second_per_gpu": 17461.26, "total_tokens": 362355457 }, { "epoch": 0.22936984246061515, "grad_norm": 0.9023430943489075, "learning_rate": 2e-05, "loss": 0.7132, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3669, "tokens_per_second_per_gpu": 17380.82, "total_tokens": 362452817 }, { "epoch": 0.2294323580895224, "grad_norm": 0.9029287099838257, "learning_rate": 2e-05, "loss": 0.6733, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3670, "tokens_per_second_per_gpu": 16823.6, "total_tokens": 362550252 }, { "epoch": 0.2294948737184296, "grad_norm": 0.9128903150558472, "learning_rate": 2e-05, "loss": 0.758, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3671, "tokens_per_second_per_gpu": 18073.45, "total_tokens": 362651855 }, { "epoch": 0.22955738934733683, "grad_norm": 0.9143405556678772, "learning_rate": 2e-05, "loss": 0.6686, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3672, "tokens_per_second_per_gpu": 17482.19, "total_tokens": 362749394 }, { "epoch": 0.22961990497624407, "grad_norm": 0.9224843978881836, "learning_rate": 2e-05, "loss": 0.7178, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3673, "tokens_per_second_per_gpu": 17705.24, "total_tokens": 362847989 }, { "epoch": 0.2296824206051513, "grad_norm": 0.9034528732299805, "learning_rate": 2e-05, "loss": 0.7155, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3674, "tokens_per_second_per_gpu": 17048.59, "total_tokens": 362947996 }, { "epoch": 0.2297449362340585, "grad_norm": 0.9011315703392029, "learning_rate": 2e-05, "loss": 0.6601, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3675, "tokens_per_second_per_gpu": 17541.68, "total_tokens": 363045142 }, { "epoch": 0.22980745186296575, "grad_norm": 0.8785156607627869, "learning_rate": 2e-05, "loss": 0.6714, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3676, "tokens_per_second_per_gpu": 17434.08, "total_tokens": 363145461 }, { "epoch": 0.22986996749187297, "grad_norm": 0.9083364009857178, "learning_rate": 2e-05, "loss": 0.7621, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3677, "tokens_per_second_per_gpu": 18049.46, "total_tokens": 363249757 }, { "epoch": 0.22993248312078018, "grad_norm": 1.005993366241455, "learning_rate": 2e-05, "loss": 0.7053, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3678, "tokens_per_second_per_gpu": 17995.02, "total_tokens": 363348926 }, { "epoch": 0.22999499874968743, "grad_norm": 0.908791184425354, "learning_rate": 2e-05, "loss": 0.7133, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3679, "tokens_per_second_per_gpu": 18265.12, "total_tokens": 363449593 }, { "epoch": 0.23005751437859465, "grad_norm": 0.8960698246955872, "learning_rate": 2e-05, "loss": 0.7047, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3680, "tokens_per_second_per_gpu": 17613.53, "total_tokens": 363552448 }, { "epoch": 0.23012003000750186, "grad_norm": 0.9418147802352905, "learning_rate": 2e-05, "loss": 0.7471, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3681, "tokens_per_second_per_gpu": 17814.96, "total_tokens": 363649823 }, { "epoch": 0.2301825456364091, "grad_norm": 0.8925384879112244, "learning_rate": 2e-05, "loss": 0.6833, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3682, "tokens_per_second_per_gpu": 18903.65, "total_tokens": 363754917 }, { "epoch": 0.23024506126531633, "grad_norm": 0.9122097492218018, "learning_rate": 2e-05, "loss": 0.6919, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3683, "tokens_per_second_per_gpu": 18357.61, "total_tokens": 363856592 }, { "epoch": 0.23030757689422354, "grad_norm": 0.920913815498352, "learning_rate": 2e-05, "loss": 0.6856, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3684, "tokens_per_second_per_gpu": 17578.62, "total_tokens": 363955790 }, { "epoch": 0.2303700925231308, "grad_norm": 0.8980070352554321, "learning_rate": 2e-05, "loss": 0.712, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3685, "tokens_per_second_per_gpu": 17566.79, "total_tokens": 364054630 }, { "epoch": 0.230432608152038, "grad_norm": 0.8798218369483948, "learning_rate": 2e-05, "loss": 0.7237, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3686, "tokens_per_second_per_gpu": 18028.31, "total_tokens": 364158220 }, { "epoch": 0.23049512378094525, "grad_norm": 0.9025009274482727, "learning_rate": 2e-05, "loss": 0.6807, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3687, "tokens_per_second_per_gpu": 15931.5, "total_tokens": 364253310 }, { "epoch": 0.23055763940985247, "grad_norm": 0.8821442127227783, "learning_rate": 2e-05, "loss": 0.6951, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3688, "tokens_per_second_per_gpu": 17479.48, "total_tokens": 364353587 }, { "epoch": 0.23062015503875968, "grad_norm": 0.8932574391365051, "learning_rate": 2e-05, "loss": 0.694, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3689, "tokens_per_second_per_gpu": 17764.3, "total_tokens": 364452431 }, { "epoch": 0.23068267066766693, "grad_norm": 0.9163625836372375, "learning_rate": 2e-05, "loss": 0.6909, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3690, "tokens_per_second_per_gpu": 17143.29, "total_tokens": 364550601 }, { "epoch": 0.23074518629657415, "grad_norm": 0.9008880257606506, "learning_rate": 2e-05, "loss": 0.7102, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3691, "tokens_per_second_per_gpu": 17783.24, "total_tokens": 364651659 }, { "epoch": 0.23080770192548136, "grad_norm": 0.8992222547531128, "learning_rate": 2e-05, "loss": 0.7018, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3692, "tokens_per_second_per_gpu": 18307.77, "total_tokens": 364752236 }, { "epoch": 0.2308702175543886, "grad_norm": 0.8739280104637146, "learning_rate": 2e-05, "loss": 0.6595, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3693, "tokens_per_second_per_gpu": 15778.17, "total_tokens": 364849289 }, { "epoch": 0.23093273318329582, "grad_norm": 0.8878238797187805, "learning_rate": 2e-05, "loss": 0.7233, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3694, "tokens_per_second_per_gpu": 18147.23, "total_tokens": 364948741 }, { "epoch": 0.23099524881220304, "grad_norm": 0.923703670501709, "learning_rate": 2e-05, "loss": 0.7181, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3695, "tokens_per_second_per_gpu": 17567.97, "total_tokens": 365047589 }, { "epoch": 0.23105776444111029, "grad_norm": 0.875084400177002, "learning_rate": 2e-05, "loss": 0.6746, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3696, "tokens_per_second_per_gpu": 18158.01, "total_tokens": 365147858 }, { "epoch": 0.2311202800700175, "grad_norm": 0.8942461013793945, "learning_rate": 2e-05, "loss": 0.6672, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3697, "tokens_per_second_per_gpu": 16447.34, "total_tokens": 365243605 }, { "epoch": 0.23118279569892472, "grad_norm": 0.9329925179481506, "learning_rate": 2e-05, "loss": 0.7331, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3698, "tokens_per_second_per_gpu": 18087.35, "total_tokens": 365347189 }, { "epoch": 0.23124531132783196, "grad_norm": 0.8636906743049622, "learning_rate": 2e-05, "loss": 0.6601, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3699, "tokens_per_second_per_gpu": 18111.52, "total_tokens": 365446342 }, { "epoch": 0.23130782695673918, "grad_norm": 0.9016188979148865, "learning_rate": 2e-05, "loss": 0.6625, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3700, "tokens_per_second_per_gpu": 16630.16, "total_tokens": 365542745 }, { "epoch": 0.2313703425856464, "grad_norm": 0.9697847366333008, "learning_rate": 2e-05, "loss": 0.7047, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3701, "tokens_per_second_per_gpu": 17150.04, "total_tokens": 365638960 }, { "epoch": 0.23143285821455364, "grad_norm": 0.9278713464736938, "learning_rate": 2e-05, "loss": 0.6795, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3702, "tokens_per_second_per_gpu": 17241.54, "total_tokens": 365738067 }, { "epoch": 0.23149537384346086, "grad_norm": 0.8883047103881836, "learning_rate": 2e-05, "loss": 0.6582, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3703, "tokens_per_second_per_gpu": 18027.08, "total_tokens": 365839521 }, { "epoch": 0.2315578894723681, "grad_norm": 0.8933744430541992, "learning_rate": 2e-05, "loss": 0.6954, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3704, "tokens_per_second_per_gpu": 17361.93, "total_tokens": 365941204 }, { "epoch": 0.23162040510127532, "grad_norm": 0.9280262589454651, "learning_rate": 2e-05, "loss": 0.696, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3705, "tokens_per_second_per_gpu": 16983.63, "total_tokens": 366039608 }, { "epoch": 0.23168292073018254, "grad_norm": 0.9082515239715576, "learning_rate": 2e-05, "loss": 0.7034, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3706, "tokens_per_second_per_gpu": 17259.75, "total_tokens": 366136736 }, { "epoch": 0.23174543635908978, "grad_norm": 0.8780248165130615, "learning_rate": 2e-05, "loss": 0.6827, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3707, "tokens_per_second_per_gpu": 17214.29, "total_tokens": 366234171 }, { "epoch": 0.231807951987997, "grad_norm": 0.8978534936904907, "learning_rate": 2e-05, "loss": 0.6702, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3708, "tokens_per_second_per_gpu": 16998.82, "total_tokens": 366330488 }, { "epoch": 0.23187046761690422, "grad_norm": 0.9114447236061096, "learning_rate": 2e-05, "loss": 0.7444, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3709, "tokens_per_second_per_gpu": 18407.04, "total_tokens": 366432631 }, { "epoch": 0.23193298324581146, "grad_norm": 0.9170315861701965, "learning_rate": 2e-05, "loss": 0.6845, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3710, "tokens_per_second_per_gpu": 16199.64, "total_tokens": 366529016 }, { "epoch": 0.23199549887471868, "grad_norm": 0.9279804229736328, "learning_rate": 2e-05, "loss": 0.7157, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3711, "tokens_per_second_per_gpu": 16472.82, "total_tokens": 366622944 }, { "epoch": 0.2320580145036259, "grad_norm": 0.8924099802970886, "learning_rate": 2e-05, "loss": 0.6918, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3712, "tokens_per_second_per_gpu": 17240.86, "total_tokens": 366722990 }, { "epoch": 0.23212053013253314, "grad_norm": 0.9028686881065369, "learning_rate": 2e-05, "loss": 0.7482, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3713, "tokens_per_second_per_gpu": 18275.57, "total_tokens": 366823614 }, { "epoch": 0.23218304576144036, "grad_norm": 0.9564517140388489, "learning_rate": 2e-05, "loss": 0.6929, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3714, "tokens_per_second_per_gpu": 17012.8, "total_tokens": 366922113 }, { "epoch": 0.23224556139034758, "grad_norm": 0.9018860459327698, "learning_rate": 2e-05, "loss": 0.6742, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3715, "tokens_per_second_per_gpu": 17297.61, "total_tokens": 367018525 }, { "epoch": 0.23230807701925482, "grad_norm": 0.8815640807151794, "learning_rate": 2e-05, "loss": 0.6952, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3716, "tokens_per_second_per_gpu": 17810.39, "total_tokens": 367120167 }, { "epoch": 0.23237059264816204, "grad_norm": 0.9015478491783142, "learning_rate": 2e-05, "loss": 0.6992, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3717, "tokens_per_second_per_gpu": 18061.6, "total_tokens": 367223284 }, { "epoch": 0.23243310827706926, "grad_norm": 0.9082337021827698, "learning_rate": 2e-05, "loss": 0.6873, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3718, "tokens_per_second_per_gpu": 17706.17, "total_tokens": 367324445 }, { "epoch": 0.2324956239059765, "grad_norm": 0.9152607321739197, "learning_rate": 2e-05, "loss": 0.6961, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3719, "tokens_per_second_per_gpu": 16158.76, "total_tokens": 367417809 }, { "epoch": 0.23255813953488372, "grad_norm": 0.8815982341766357, "learning_rate": 2e-05, "loss": 0.6528, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3720, "tokens_per_second_per_gpu": 16730.01, "total_tokens": 367518611 }, { "epoch": 0.23262065516379093, "grad_norm": 0.8845086097717285, "learning_rate": 2e-05, "loss": 0.6867, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3721, "tokens_per_second_per_gpu": 17089.81, "total_tokens": 367613282 }, { "epoch": 0.23268317079269818, "grad_norm": 0.8953279852867126, "learning_rate": 2e-05, "loss": 0.6639, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3722, "tokens_per_second_per_gpu": 16735.91, "total_tokens": 367709132 }, { "epoch": 0.2327456864216054, "grad_norm": 0.8813169598579407, "learning_rate": 2e-05, "loss": 0.6228, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3723, "tokens_per_second_per_gpu": 16998.62, "total_tokens": 367809290 }, { "epoch": 0.23280820205051264, "grad_norm": 0.9137061834335327, "learning_rate": 2e-05, "loss": 0.7403, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3724, "tokens_per_second_per_gpu": 17760.14, "total_tokens": 367911985 }, { "epoch": 0.23287071767941986, "grad_norm": 0.9660826325416565, "learning_rate": 2e-05, "loss": 0.6936, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3725, "tokens_per_second_per_gpu": 17130.13, "total_tokens": 368009118 }, { "epoch": 0.23293323330832708, "grad_norm": 0.9191729426383972, "learning_rate": 2e-05, "loss": 0.6817, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3726, "tokens_per_second_per_gpu": 15869.18, "total_tokens": 368102661 }, { "epoch": 0.23299574893723432, "grad_norm": 0.9396277666091919, "learning_rate": 2e-05, "loss": 0.7081, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3727, "tokens_per_second_per_gpu": 17182.76, "total_tokens": 368201346 }, { "epoch": 0.23305826456614154, "grad_norm": 0.9079470038414001, "learning_rate": 2e-05, "loss": 0.7105, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3728, "tokens_per_second_per_gpu": 18793.44, "total_tokens": 368302625 }, { "epoch": 0.23312078019504875, "grad_norm": 0.895592987537384, "learning_rate": 2e-05, "loss": 0.6694, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3729, "tokens_per_second_per_gpu": 17673.79, "total_tokens": 368400392 }, { "epoch": 0.233183295823956, "grad_norm": 0.9031971096992493, "learning_rate": 2e-05, "loss": 0.7181, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3730, "tokens_per_second_per_gpu": 17673.8, "total_tokens": 368500740 }, { "epoch": 0.23324581145286322, "grad_norm": 0.9032080173492432, "learning_rate": 2e-05, "loss": 0.7021, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3731, "tokens_per_second_per_gpu": 16683.43, "total_tokens": 368598033 }, { "epoch": 0.23330832708177043, "grad_norm": 0.8890791535377502, "learning_rate": 2e-05, "loss": 0.6713, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3732, "tokens_per_second_per_gpu": 17368.25, "total_tokens": 368693966 }, { "epoch": 0.23337084271067768, "grad_norm": 0.9452884197235107, "learning_rate": 2e-05, "loss": 0.7026, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3733, "tokens_per_second_per_gpu": 17415.33, "total_tokens": 368791953 }, { "epoch": 0.2334333583395849, "grad_norm": 0.8834416270256042, "learning_rate": 2e-05, "loss": 0.6851, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3734, "tokens_per_second_per_gpu": 17062.71, "total_tokens": 368892079 }, { "epoch": 0.2334958739684921, "grad_norm": 0.9206733703613281, "learning_rate": 2e-05, "loss": 0.6993, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3735, "tokens_per_second_per_gpu": 17801.76, "total_tokens": 368993123 }, { "epoch": 0.23355838959739936, "grad_norm": 0.9390490055084229, "learning_rate": 2e-05, "loss": 0.6736, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3736, "tokens_per_second_per_gpu": 15667.08, "total_tokens": 369088745 }, { "epoch": 0.23362090522630657, "grad_norm": 0.9223515391349792, "learning_rate": 2e-05, "loss": 0.6908, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3737, "tokens_per_second_per_gpu": 16835.61, "total_tokens": 369187164 }, { "epoch": 0.2336834208552138, "grad_norm": 0.9046879410743713, "learning_rate": 2e-05, "loss": 0.6723, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3738, "tokens_per_second_per_gpu": 16965.79, "total_tokens": 369282279 }, { "epoch": 0.23374593648412104, "grad_norm": 0.9187899827957153, "learning_rate": 2e-05, "loss": 0.6673, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3739, "tokens_per_second_per_gpu": 17716.66, "total_tokens": 369377086 }, { "epoch": 0.23380845211302825, "grad_norm": 0.918920636177063, "learning_rate": 2e-05, "loss": 0.7086, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3740, "tokens_per_second_per_gpu": 17311.08, "total_tokens": 369476767 }, { "epoch": 0.23387096774193547, "grad_norm": 0.8819624781608582, "learning_rate": 2e-05, "loss": 0.7086, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3741, "tokens_per_second_per_gpu": 17936.3, "total_tokens": 369577964 }, { "epoch": 0.23393348337084272, "grad_norm": 0.9106571078300476, "learning_rate": 2e-05, "loss": 0.7165, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3742, "tokens_per_second_per_gpu": 18221.03, "total_tokens": 369679551 }, { "epoch": 0.23399599899974993, "grad_norm": 0.9072432518005371, "learning_rate": 2e-05, "loss": 0.6828, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3743, "tokens_per_second_per_gpu": 17289.37, "total_tokens": 369779763 }, { "epoch": 0.23405851462865718, "grad_norm": 0.9075369834899902, "learning_rate": 2e-05, "loss": 0.6732, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3744, "tokens_per_second_per_gpu": 16761.06, "total_tokens": 369874366 }, { "epoch": 0.2341210302575644, "grad_norm": 0.8784685134887695, "learning_rate": 2e-05, "loss": 0.7353, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3745, "tokens_per_second_per_gpu": 18482.08, "total_tokens": 369978575 }, { "epoch": 0.2341835458864716, "grad_norm": 0.9185670614242554, "learning_rate": 2e-05, "loss": 0.6737, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3746, "tokens_per_second_per_gpu": 16348.92, "total_tokens": 370072398 }, { "epoch": 0.23424606151537886, "grad_norm": 0.9167606830596924, "learning_rate": 2e-05, "loss": 0.6643, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3747, "tokens_per_second_per_gpu": 16067.09, "total_tokens": 370166339 }, { "epoch": 0.23430857714428607, "grad_norm": 0.8870593905448914, "learning_rate": 2e-05, "loss": 0.653, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3748, "tokens_per_second_per_gpu": 16630.47, "total_tokens": 370262616 }, { "epoch": 0.2343710927731933, "grad_norm": 0.9107251167297363, "learning_rate": 2e-05, "loss": 0.6831, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3749, "tokens_per_second_per_gpu": 16642.25, "total_tokens": 370360743 }, { "epoch": 0.23443360840210054, "grad_norm": 0.9067843556404114, "learning_rate": 2e-05, "loss": 0.6809, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3750, "tokens_per_second_per_gpu": 16294.35, "total_tokens": 370457217 }, { "epoch": 0.23449612403100775, "grad_norm": 0.9178541898727417, "learning_rate": 2e-05, "loss": 0.7366, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3751, "tokens_per_second_per_gpu": 16334.54, "total_tokens": 370556762 }, { "epoch": 0.23455863965991497, "grad_norm": 0.8908782005310059, "learning_rate": 2e-05, "loss": 0.6602, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3752, "tokens_per_second_per_gpu": 17025.3, "total_tokens": 370651583 }, { "epoch": 0.23462115528882221, "grad_norm": 0.9114781022071838, "learning_rate": 2e-05, "loss": 0.6713, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3753, "tokens_per_second_per_gpu": 18094.75, "total_tokens": 370751476 }, { "epoch": 0.23468367091772943, "grad_norm": 0.9039619565010071, "learning_rate": 2e-05, "loss": 0.7075, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3754, "tokens_per_second_per_gpu": 16519.91, "total_tokens": 370849023 }, { "epoch": 0.23474618654663665, "grad_norm": 0.9169074892997742, "learning_rate": 2e-05, "loss": 0.6752, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3755, "tokens_per_second_per_gpu": 17379.82, "total_tokens": 370945874 }, { "epoch": 0.2348087021755439, "grad_norm": 0.9026694893836975, "learning_rate": 2e-05, "loss": 0.7303, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3756, "tokens_per_second_per_gpu": 16927.11, "total_tokens": 371045274 }, { "epoch": 0.2348712178044511, "grad_norm": 0.9199141263961792, "learning_rate": 2e-05, "loss": 0.6725, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3757, "tokens_per_second_per_gpu": 17683.62, "total_tokens": 371146164 }, { "epoch": 0.23493373343335833, "grad_norm": 0.8896527886390686, "learning_rate": 2e-05, "loss": 0.7024, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3758, "tokens_per_second_per_gpu": 18396.55, "total_tokens": 371249648 }, { "epoch": 0.23499624906226557, "grad_norm": 0.91748046875, "learning_rate": 2e-05, "loss": 0.6756, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3759, "tokens_per_second_per_gpu": 16695.61, "total_tokens": 371344937 }, { "epoch": 0.2350587646911728, "grad_norm": 0.8891202807426453, "learning_rate": 2e-05, "loss": 0.7045, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3760, "tokens_per_second_per_gpu": 18457.15, "total_tokens": 371446006 }, { "epoch": 0.23512128032008003, "grad_norm": 0.9617342352867126, "learning_rate": 2e-05, "loss": 0.6896, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3761, "tokens_per_second_per_gpu": 18276.71, "total_tokens": 371552021 }, { "epoch": 0.23518379594898725, "grad_norm": 0.8995064496994019, "learning_rate": 2e-05, "loss": 0.7095, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3762, "tokens_per_second_per_gpu": 17903.77, "total_tokens": 371652686 }, { "epoch": 0.23524631157789447, "grad_norm": 0.9287435412406921, "learning_rate": 2e-05, "loss": 0.7103, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3763, "tokens_per_second_per_gpu": 16632.86, "total_tokens": 371749725 }, { "epoch": 0.2353088272068017, "grad_norm": 0.9172918796539307, "learning_rate": 2e-05, "loss": 0.6735, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3764, "tokens_per_second_per_gpu": 15848.8, "total_tokens": 371844560 }, { "epoch": 0.23537134283570893, "grad_norm": 0.9327632188796997, "learning_rate": 2e-05, "loss": 0.7101, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3765, "tokens_per_second_per_gpu": 16325.67, "total_tokens": 371940671 }, { "epoch": 0.23543385846461615, "grad_norm": 0.8849279880523682, "learning_rate": 2e-05, "loss": 0.7011, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3766, "tokens_per_second_per_gpu": 18602.09, "total_tokens": 372043853 }, { "epoch": 0.2354963740935234, "grad_norm": 0.9266764521598816, "learning_rate": 2e-05, "loss": 0.7163, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3767, "tokens_per_second_per_gpu": 19069.41, "total_tokens": 372147675 }, { "epoch": 0.2355588897224306, "grad_norm": 0.904867947101593, "learning_rate": 2e-05, "loss": 0.6605, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3768, "tokens_per_second_per_gpu": 16271.74, "total_tokens": 372244118 }, { "epoch": 0.23562140535133783, "grad_norm": 0.9057216644287109, "learning_rate": 2e-05, "loss": 0.7483, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3769, "tokens_per_second_per_gpu": 18879.49, "total_tokens": 372348944 }, { "epoch": 0.23568392098024507, "grad_norm": 0.8573882579803467, "learning_rate": 2e-05, "loss": 0.6912, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3770, "tokens_per_second_per_gpu": 17309.96, "total_tokens": 372448707 }, { "epoch": 0.2357464366091523, "grad_norm": 0.9066669344902039, "learning_rate": 2e-05, "loss": 0.7406, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3771, "tokens_per_second_per_gpu": 17446.86, "total_tokens": 372545873 }, { "epoch": 0.2358089522380595, "grad_norm": 0.8873423933982849, "learning_rate": 2e-05, "loss": 0.7146, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3772, "tokens_per_second_per_gpu": 17479.83, "total_tokens": 372649145 }, { "epoch": 0.23587146786696675, "grad_norm": 0.8861998915672302, "learning_rate": 2e-05, "loss": 0.678, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3773, "tokens_per_second_per_gpu": 16589.92, "total_tokens": 372750951 }, { "epoch": 0.23593398349587397, "grad_norm": 0.930590808391571, "learning_rate": 2e-05, "loss": 0.7213, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3774, "tokens_per_second_per_gpu": 16744.38, "total_tokens": 372849998 }, { "epoch": 0.23599649912478118, "grad_norm": 0.8777751326560974, "learning_rate": 2e-05, "loss": 0.6817, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3775, "tokens_per_second_per_gpu": 17958.76, "total_tokens": 372949687 }, { "epoch": 0.23605901475368843, "grad_norm": 0.9163925647735596, "learning_rate": 2e-05, "loss": 0.7315, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3776, "tokens_per_second_per_gpu": 17433.46, "total_tokens": 373049683 }, { "epoch": 0.23612153038259565, "grad_norm": 0.9057314991950989, "learning_rate": 2e-05, "loss": 0.7173, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3777, "tokens_per_second_per_gpu": 16189.52, "total_tokens": 373146310 }, { "epoch": 0.23618404601150286, "grad_norm": 0.9093877673149109, "learning_rate": 2e-05, "loss": 0.6512, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3778, "tokens_per_second_per_gpu": 17166.18, "total_tokens": 373242003 }, { "epoch": 0.2362465616404101, "grad_norm": 0.9180877208709717, "learning_rate": 2e-05, "loss": 0.7211, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3779, "tokens_per_second_per_gpu": 17797.66, "total_tokens": 373343316 }, { "epoch": 0.23630907726931732, "grad_norm": 0.9121085405349731, "learning_rate": 2e-05, "loss": 0.6816, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3780, "tokens_per_second_per_gpu": 16198.67, "total_tokens": 373437949 }, { "epoch": 0.23637159289822457, "grad_norm": 0.9003970623016357, "learning_rate": 2e-05, "loss": 0.6686, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3781, "tokens_per_second_per_gpu": 15937.88, "total_tokens": 373533223 }, { "epoch": 0.2364341085271318, "grad_norm": 0.9017162919044495, "learning_rate": 2e-05, "loss": 0.7073, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3782, "tokens_per_second_per_gpu": 17900.73, "total_tokens": 373635400 }, { "epoch": 0.236496624156039, "grad_norm": 0.8622608184814453, "learning_rate": 2e-05, "loss": 0.7041, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3783, "tokens_per_second_per_gpu": 17515.15, "total_tokens": 373738240 }, { "epoch": 0.23655913978494625, "grad_norm": 0.9097528457641602, "learning_rate": 2e-05, "loss": 0.7137, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3784, "tokens_per_second_per_gpu": 17808.97, "total_tokens": 373839613 }, { "epoch": 0.23662165541385347, "grad_norm": 0.8972534537315369, "learning_rate": 2e-05, "loss": 0.6928, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3785, "tokens_per_second_per_gpu": 16784.57, "total_tokens": 373940773 }, { "epoch": 0.23668417104276068, "grad_norm": 0.908900797367096, "learning_rate": 2e-05, "loss": 0.6888, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3786, "tokens_per_second_per_gpu": 17665.27, "total_tokens": 374042401 }, { "epoch": 0.23674668667166793, "grad_norm": 0.9046539068222046, "learning_rate": 2e-05, "loss": 0.7005, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3787, "tokens_per_second_per_gpu": 16737.3, "total_tokens": 374139619 }, { "epoch": 0.23680920230057514, "grad_norm": 0.8820765018463135, "learning_rate": 2e-05, "loss": 0.6608, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3788, "tokens_per_second_per_gpu": 17171.58, "total_tokens": 374237615 }, { "epoch": 0.23687171792948236, "grad_norm": 0.8784858584403992, "learning_rate": 2e-05, "loss": 0.7047, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3789, "tokens_per_second_per_gpu": 16958.68, "total_tokens": 374337459 }, { "epoch": 0.2369342335583896, "grad_norm": 0.899137020111084, "learning_rate": 2e-05, "loss": 0.7034, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3790, "tokens_per_second_per_gpu": 17601.8, "total_tokens": 374440126 }, { "epoch": 0.23699674918729682, "grad_norm": 0.9160723686218262, "learning_rate": 2e-05, "loss": 0.7292, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3791, "tokens_per_second_per_gpu": 17053.21, "total_tokens": 374540365 }, { "epoch": 0.23705926481620404, "grad_norm": 0.964017391204834, "learning_rate": 2e-05, "loss": 0.6983, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3792, "tokens_per_second_per_gpu": 15900.91, "total_tokens": 374631065 }, { "epoch": 0.23712178044511129, "grad_norm": 0.8888193368911743, "learning_rate": 2e-05, "loss": 0.6959, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3793, "tokens_per_second_per_gpu": 17045.39, "total_tokens": 374731615 }, { "epoch": 0.2371842960740185, "grad_norm": 0.8848221898078918, "learning_rate": 2e-05, "loss": 0.6724, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3794, "tokens_per_second_per_gpu": 17436.34, "total_tokens": 374831886 }, { "epoch": 0.23724681170292572, "grad_norm": 0.8888022899627686, "learning_rate": 2e-05, "loss": 0.7203, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3795, "tokens_per_second_per_gpu": 18113.01, "total_tokens": 374936951 }, { "epoch": 0.23730932733183296, "grad_norm": 0.9024326801300049, "learning_rate": 2e-05, "loss": 0.6884, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3796, "tokens_per_second_per_gpu": 17753.08, "total_tokens": 375039328 }, { "epoch": 0.23737184296074018, "grad_norm": 0.8905428647994995, "learning_rate": 2e-05, "loss": 0.6678, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3797, "tokens_per_second_per_gpu": 16618.88, "total_tokens": 375136202 }, { "epoch": 0.2374343585896474, "grad_norm": 0.9422535300254822, "learning_rate": 2e-05, "loss": 0.693, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3798, "tokens_per_second_per_gpu": 16829.84, "total_tokens": 375231748 }, { "epoch": 0.23749687421855464, "grad_norm": 0.9436564445495605, "learning_rate": 2e-05, "loss": 0.6975, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3799, "tokens_per_second_per_gpu": 17607.57, "total_tokens": 375331963 }, { "epoch": 0.23755938984746186, "grad_norm": 0.9194092750549316, "learning_rate": 2e-05, "loss": 0.6712, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3800, "tokens_per_second_per_gpu": 17102.99, "total_tokens": 375430798 }, { "epoch": 0.2376219054763691, "grad_norm": 0.8804375529289246, "learning_rate": 2e-05, "loss": 0.6747, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3801, "tokens_per_second_per_gpu": 17664.39, "total_tokens": 375529105 }, { "epoch": 0.23768442110527632, "grad_norm": 0.8897290825843811, "learning_rate": 2e-05, "loss": 0.7054, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3802, "tokens_per_second_per_gpu": 17608.72, "total_tokens": 375631629 }, { "epoch": 0.23774693673418354, "grad_norm": 0.9188511967658997, "learning_rate": 2e-05, "loss": 0.6983, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3803, "tokens_per_second_per_gpu": 17265.89, "total_tokens": 375727857 }, { "epoch": 0.23780945236309078, "grad_norm": 0.9634436964988708, "learning_rate": 2e-05, "loss": 0.6824, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3804, "tokens_per_second_per_gpu": 16501.85, "total_tokens": 375820629 }, { "epoch": 0.237871967991998, "grad_norm": 0.8552356958389282, "learning_rate": 2e-05, "loss": 0.6898, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3805, "tokens_per_second_per_gpu": 17763.54, "total_tokens": 375925536 }, { "epoch": 0.23793448362090522, "grad_norm": 0.9212337732315063, "learning_rate": 2e-05, "loss": 0.7416, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3806, "tokens_per_second_per_gpu": 17747.67, "total_tokens": 376023072 }, { "epoch": 0.23799699924981246, "grad_norm": 0.9438738822937012, "learning_rate": 2e-05, "loss": 0.7057, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3807, "tokens_per_second_per_gpu": 17836.87, "total_tokens": 376122848 }, { "epoch": 0.23805951487871968, "grad_norm": 0.8713642954826355, "learning_rate": 2e-05, "loss": 0.6699, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3808, "tokens_per_second_per_gpu": 16953.3, "total_tokens": 376220165 }, { "epoch": 0.2381220305076269, "grad_norm": 1.0312750339508057, "learning_rate": 2e-05, "loss": 0.6869, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3809, "tokens_per_second_per_gpu": 16762.11, "total_tokens": 376320851 }, { "epoch": 0.23818454613653414, "grad_norm": 0.9083718061447144, "learning_rate": 2e-05, "loss": 0.685, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3810, "tokens_per_second_per_gpu": 17898.12, "total_tokens": 376420804 }, { "epoch": 0.23824706176544136, "grad_norm": 0.9175015687942505, "learning_rate": 2e-05, "loss": 0.6528, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3811, "tokens_per_second_per_gpu": 15492.41, "total_tokens": 376511444 }, { "epoch": 0.23830957739434858, "grad_norm": 0.9322772026062012, "learning_rate": 2e-05, "loss": 0.7261, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3812, "tokens_per_second_per_gpu": 17346.69, "total_tokens": 376610141 }, { "epoch": 0.23837209302325582, "grad_norm": 0.8850277066230774, "learning_rate": 2e-05, "loss": 0.6733, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3813, "tokens_per_second_per_gpu": 17838.01, "total_tokens": 376708727 }, { "epoch": 0.23843460865216304, "grad_norm": 0.8977729082107544, "learning_rate": 2e-05, "loss": 0.7047, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3814, "tokens_per_second_per_gpu": 17255.54, "total_tokens": 376808432 }, { "epoch": 0.23849712428107026, "grad_norm": 0.8522533774375916, "learning_rate": 2e-05, "loss": 0.6709, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3815, "tokens_per_second_per_gpu": 18118.17, "total_tokens": 376910468 }, { "epoch": 0.2385596399099775, "grad_norm": 0.9152560234069824, "learning_rate": 2e-05, "loss": 0.7316, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3816, "tokens_per_second_per_gpu": 16305.9, "total_tokens": 377007960 }, { "epoch": 0.23862215553888472, "grad_norm": 0.9280030727386475, "learning_rate": 2e-05, "loss": 0.7168, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3817, "tokens_per_second_per_gpu": 18475.16, "total_tokens": 377106796 }, { "epoch": 0.23868467116779193, "grad_norm": 0.8912546634674072, "learning_rate": 2e-05, "loss": 0.724, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3818, "tokens_per_second_per_gpu": 17575.99, "total_tokens": 377207657 }, { "epoch": 0.23874718679669918, "grad_norm": 0.8911285996437073, "learning_rate": 2e-05, "loss": 0.7089, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3819, "tokens_per_second_per_gpu": 18160.24, "total_tokens": 377310249 }, { "epoch": 0.2388097024256064, "grad_norm": 0.8879356384277344, "learning_rate": 2e-05, "loss": 0.6992, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3820, "tokens_per_second_per_gpu": 18182.51, "total_tokens": 377415362 }, { "epoch": 0.23887221805451364, "grad_norm": 0.9199831485748291, "learning_rate": 2e-05, "loss": 0.7169, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3821, "tokens_per_second_per_gpu": 18443.91, "total_tokens": 377518059 }, { "epoch": 0.23893473368342086, "grad_norm": 0.9042055010795593, "learning_rate": 2e-05, "loss": 0.6583, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3822, "tokens_per_second_per_gpu": 16489.31, "total_tokens": 377613636 }, { "epoch": 0.23899724931232808, "grad_norm": 0.8689868450164795, "learning_rate": 2e-05, "loss": 0.6939, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3823, "tokens_per_second_per_gpu": 17864.21, "total_tokens": 377714813 }, { "epoch": 0.23905976494123532, "grad_norm": 0.8682155609130859, "learning_rate": 2e-05, "loss": 0.6685, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3824, "tokens_per_second_per_gpu": 17705.84, "total_tokens": 377817037 }, { "epoch": 0.23912228057014254, "grad_norm": 0.907200038433075, "learning_rate": 2e-05, "loss": 0.6902, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3825, "tokens_per_second_per_gpu": 17393.35, "total_tokens": 377915229 }, { "epoch": 0.23918479619904975, "grad_norm": 0.8889797329902649, "learning_rate": 2e-05, "loss": 0.6812, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3826, "tokens_per_second_per_gpu": 17469.91, "total_tokens": 378015845 }, { "epoch": 0.239247311827957, "grad_norm": 0.8659828901290894, "learning_rate": 2e-05, "loss": 0.6601, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3827, "tokens_per_second_per_gpu": 18642.88, "total_tokens": 378119163 }, { "epoch": 0.23930982745686422, "grad_norm": 0.9101812839508057, "learning_rate": 2e-05, "loss": 0.7266, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3828, "tokens_per_second_per_gpu": 18336.23, "total_tokens": 378219087 }, { "epoch": 0.23937234308577143, "grad_norm": 0.913473904132843, "learning_rate": 2e-05, "loss": 0.685, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3829, "tokens_per_second_per_gpu": 16653.57, "total_tokens": 378318129 }, { "epoch": 0.23943485871467868, "grad_norm": 0.8660212755203247, "learning_rate": 2e-05, "loss": 0.6685, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3830, "tokens_per_second_per_gpu": 18084.6, "total_tokens": 378419436 }, { "epoch": 0.2394973743435859, "grad_norm": 0.9276092052459717, "learning_rate": 2e-05, "loss": 0.7352, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3831, "tokens_per_second_per_gpu": 17413.52, "total_tokens": 378518273 }, { "epoch": 0.2395598899724931, "grad_norm": 0.9256516098976135, "learning_rate": 2e-05, "loss": 0.6578, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3832, "tokens_per_second_per_gpu": 16948.51, "total_tokens": 378610681 }, { "epoch": 0.23962240560140036, "grad_norm": 0.8609889149665833, "learning_rate": 2e-05, "loss": 0.6631, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3833, "tokens_per_second_per_gpu": 17288.17, "total_tokens": 378711745 }, { "epoch": 0.23968492123030757, "grad_norm": 0.8704242706298828, "learning_rate": 2e-05, "loss": 0.6445, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3834, "tokens_per_second_per_gpu": 16200.71, "total_tokens": 378809636 }, { "epoch": 0.2397474368592148, "grad_norm": 0.8988168239593506, "learning_rate": 2e-05, "loss": 0.6595, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3835, "tokens_per_second_per_gpu": 16735.98, "total_tokens": 378905925 }, { "epoch": 0.23980995248812204, "grad_norm": 0.8632112145423889, "learning_rate": 2e-05, "loss": 0.6587, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3836, "tokens_per_second_per_gpu": 16734.48, "total_tokens": 379003451 }, { "epoch": 0.23987246811702925, "grad_norm": 0.912054181098938, "learning_rate": 2e-05, "loss": 0.6631, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3837, "tokens_per_second_per_gpu": 16119.57, "total_tokens": 379096373 }, { "epoch": 0.2399349837459365, "grad_norm": 0.9080297350883484, "learning_rate": 2e-05, "loss": 0.6925, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3838, "tokens_per_second_per_gpu": 19284.8, "total_tokens": 379200930 }, { "epoch": 0.23999749937484371, "grad_norm": 0.91374272108078, "learning_rate": 2e-05, "loss": 0.7085, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3839, "tokens_per_second_per_gpu": 17156.16, "total_tokens": 379300134 }, { "epoch": 0.24006001500375093, "grad_norm": 0.9462574124336243, "learning_rate": 2e-05, "loss": 0.7113, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3840, "tokens_per_second_per_gpu": 17799.18, "total_tokens": 379397234 }, { "epoch": 0.24012253063265818, "grad_norm": 0.9158399105072021, "learning_rate": 2e-05, "loss": 0.6641, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3841, "tokens_per_second_per_gpu": 15222.53, "total_tokens": 379490152 }, { "epoch": 0.2401850462615654, "grad_norm": 0.9158241152763367, "learning_rate": 2e-05, "loss": 0.7226, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3842, "tokens_per_second_per_gpu": 18225.11, "total_tokens": 379592630 }, { "epoch": 0.2402475618904726, "grad_norm": 0.9126071333885193, "learning_rate": 2e-05, "loss": 0.6864, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3843, "tokens_per_second_per_gpu": 17247.94, "total_tokens": 379689350 }, { "epoch": 0.24031007751937986, "grad_norm": 0.9336732029914856, "learning_rate": 2e-05, "loss": 0.7267, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3844, "tokens_per_second_per_gpu": 17276.88, "total_tokens": 379790258 }, { "epoch": 0.24037259314828707, "grad_norm": 0.9181432127952576, "learning_rate": 2e-05, "loss": 0.6639, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3845, "tokens_per_second_per_gpu": 15959.31, "total_tokens": 379882496 }, { "epoch": 0.2404351087771943, "grad_norm": 0.9217442870140076, "learning_rate": 2e-05, "loss": 0.7299, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3846, "tokens_per_second_per_gpu": 17151.21, "total_tokens": 379981410 }, { "epoch": 0.24049762440610153, "grad_norm": 0.9303194284439087, "learning_rate": 2e-05, "loss": 0.6916, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3847, "tokens_per_second_per_gpu": 16448.01, "total_tokens": 380076410 }, { "epoch": 0.24056014003500875, "grad_norm": 0.8719265460968018, "learning_rate": 2e-05, "loss": 0.6769, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3848, "tokens_per_second_per_gpu": 17861.5, "total_tokens": 380177429 }, { "epoch": 0.24062265566391597, "grad_norm": 0.9125577807426453, "learning_rate": 2e-05, "loss": 0.7026, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3849, "tokens_per_second_per_gpu": 17956.49, "total_tokens": 380276348 }, { "epoch": 0.2406851712928232, "grad_norm": 0.9014571309089661, "learning_rate": 2e-05, "loss": 0.6656, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3850, "tokens_per_second_per_gpu": 16641.32, "total_tokens": 380373535 }, { "epoch": 0.24074768692173043, "grad_norm": 0.9511159658432007, "learning_rate": 2e-05, "loss": 0.6867, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3851, "tokens_per_second_per_gpu": 18440.01, "total_tokens": 380473186 }, { "epoch": 0.24081020255063765, "grad_norm": 0.8936749696731567, "learning_rate": 2e-05, "loss": 0.7096, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3852, "tokens_per_second_per_gpu": 16895.82, "total_tokens": 380572786 }, { "epoch": 0.2408727181795449, "grad_norm": 0.8895410299301147, "learning_rate": 2e-05, "loss": 0.6644, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3853, "tokens_per_second_per_gpu": 17030.94, "total_tokens": 380672626 }, { "epoch": 0.2409352338084521, "grad_norm": 0.8832585215568542, "learning_rate": 2e-05, "loss": 0.71, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3854, "tokens_per_second_per_gpu": 17878.99, "total_tokens": 380773455 }, { "epoch": 0.24099774943735933, "grad_norm": 0.9169679880142212, "learning_rate": 2e-05, "loss": 0.7225, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3855, "tokens_per_second_per_gpu": 17971.51, "total_tokens": 380873987 }, { "epoch": 0.24106026506626657, "grad_norm": 0.8966162204742432, "learning_rate": 2e-05, "loss": 0.7074, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3856, "tokens_per_second_per_gpu": 18931.39, "total_tokens": 380977150 }, { "epoch": 0.2411227806951738, "grad_norm": 0.8900396227836609, "learning_rate": 2e-05, "loss": 0.709, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3857, "tokens_per_second_per_gpu": 17571.85, "total_tokens": 381077023 }, { "epoch": 0.24118529632408103, "grad_norm": 0.924573540687561, "learning_rate": 2e-05, "loss": 0.7006, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3858, "tokens_per_second_per_gpu": 18152.38, "total_tokens": 381178212 }, { "epoch": 0.24124781195298825, "grad_norm": 0.9003538489341736, "learning_rate": 2e-05, "loss": 0.6592, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3859, "tokens_per_second_per_gpu": 18122.64, "total_tokens": 381277758 }, { "epoch": 0.24131032758189547, "grad_norm": 0.9084029793739319, "learning_rate": 2e-05, "loss": 0.6859, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3860, "tokens_per_second_per_gpu": 17535.79, "total_tokens": 381373135 }, { "epoch": 0.2413728432108027, "grad_norm": 0.92169189453125, "learning_rate": 2e-05, "loss": 0.6493, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3861, "tokens_per_second_per_gpu": 16604.85, "total_tokens": 381466267 }, { "epoch": 0.24143535883970993, "grad_norm": 0.9082900881767273, "learning_rate": 2e-05, "loss": 0.7489, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3862, "tokens_per_second_per_gpu": 17432.35, "total_tokens": 381565359 }, { "epoch": 0.24149787446861715, "grad_norm": 0.9179226160049438, "learning_rate": 2e-05, "loss": 0.7131, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3863, "tokens_per_second_per_gpu": 17293.56, "total_tokens": 381661434 }, { "epoch": 0.2415603900975244, "grad_norm": 0.8882834315299988, "learning_rate": 2e-05, "loss": 0.6752, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3864, "tokens_per_second_per_gpu": 17213.94, "total_tokens": 381758472 }, { "epoch": 0.2416229057264316, "grad_norm": 0.9001519680023193, "learning_rate": 2e-05, "loss": 0.7259, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3865, "tokens_per_second_per_gpu": 17899.86, "total_tokens": 381854940 }, { "epoch": 0.24168542135533883, "grad_norm": 0.8743663430213928, "learning_rate": 2e-05, "loss": 0.673, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3866, "tokens_per_second_per_gpu": 17437.87, "total_tokens": 381956974 }, { "epoch": 0.24174793698424607, "grad_norm": 0.8831800818443298, "learning_rate": 2e-05, "loss": 0.6701, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3867, "tokens_per_second_per_gpu": 16786.04, "total_tokens": 382054752 }, { "epoch": 0.2418104526131533, "grad_norm": 0.9058486223220825, "learning_rate": 2e-05, "loss": 0.7006, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3868, "tokens_per_second_per_gpu": 17793.21, "total_tokens": 382153068 }, { "epoch": 0.2418729682420605, "grad_norm": 0.8818197250366211, "learning_rate": 2e-05, "loss": 0.6994, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3869, "tokens_per_second_per_gpu": 16955.53, "total_tokens": 382250404 }, { "epoch": 0.24193548387096775, "grad_norm": 0.8767706751823425, "learning_rate": 2e-05, "loss": 0.6855, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3870, "tokens_per_second_per_gpu": 17928.82, "total_tokens": 382352733 }, { "epoch": 0.24199799949987497, "grad_norm": 0.8851787447929382, "learning_rate": 2e-05, "loss": 0.6763, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3871, "tokens_per_second_per_gpu": 17252.12, "total_tokens": 382448778 }, { "epoch": 0.24206051512878218, "grad_norm": 0.9035895466804504, "learning_rate": 2e-05, "loss": 0.7039, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3872, "tokens_per_second_per_gpu": 17857.26, "total_tokens": 382549128 }, { "epoch": 0.24212303075768943, "grad_norm": 0.9003928899765015, "learning_rate": 2e-05, "loss": 0.6997, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3873, "tokens_per_second_per_gpu": 17327.01, "total_tokens": 382646539 }, { "epoch": 0.24218554638659665, "grad_norm": 0.8877823352813721, "learning_rate": 2e-05, "loss": 0.7006, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3874, "tokens_per_second_per_gpu": 18627.57, "total_tokens": 382749073 }, { "epoch": 0.24224806201550386, "grad_norm": 0.8989062309265137, "learning_rate": 2e-05, "loss": 0.7082, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3875, "tokens_per_second_per_gpu": 17484.22, "total_tokens": 382847441 }, { "epoch": 0.2423105776444111, "grad_norm": 0.8988809585571289, "learning_rate": 2e-05, "loss": 0.7021, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3876, "tokens_per_second_per_gpu": 17142.66, "total_tokens": 382948576 }, { "epoch": 0.24237309327331832, "grad_norm": 0.9212620854377747, "learning_rate": 2e-05, "loss": 0.693, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3877, "tokens_per_second_per_gpu": 18171.72, "total_tokens": 383049642 }, { "epoch": 0.24243560890222557, "grad_norm": 0.8764498233795166, "learning_rate": 2e-05, "loss": 0.7108, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3878, "tokens_per_second_per_gpu": 17788.8, "total_tokens": 383153420 }, { "epoch": 0.24249812453113279, "grad_norm": 0.9030769467353821, "learning_rate": 2e-05, "loss": 0.6764, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3879, "tokens_per_second_per_gpu": 16307.23, "total_tokens": 383249131 }, { "epoch": 0.24256064016004, "grad_norm": 0.9293103218078613, "learning_rate": 2e-05, "loss": 0.6671, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3880, "tokens_per_second_per_gpu": 16518.4, "total_tokens": 383342092 }, { "epoch": 0.24262315578894725, "grad_norm": 0.9215378761291504, "learning_rate": 2e-05, "loss": 0.7014, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3881, "tokens_per_second_per_gpu": 18012.96, "total_tokens": 383442880 }, { "epoch": 0.24268567141785446, "grad_norm": 0.9438020586967468, "learning_rate": 2e-05, "loss": 0.7204, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3882, "tokens_per_second_per_gpu": 16804.42, "total_tokens": 383540890 }, { "epoch": 0.24274818704676168, "grad_norm": 0.9129644632339478, "learning_rate": 2e-05, "loss": 0.6861, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3883, "tokens_per_second_per_gpu": 17515.4, "total_tokens": 383638277 }, { "epoch": 0.24281070267566893, "grad_norm": 0.8570718765258789, "learning_rate": 2e-05, "loss": 0.6632, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3884, "tokens_per_second_per_gpu": 17132.32, "total_tokens": 383737304 }, { "epoch": 0.24287321830457614, "grad_norm": 0.8828704357147217, "learning_rate": 2e-05, "loss": 0.6464, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3885, "tokens_per_second_per_gpu": 16478.52, "total_tokens": 383835228 }, { "epoch": 0.24293573393348336, "grad_norm": 0.9178020358085632, "learning_rate": 2e-05, "loss": 0.6917, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3886, "tokens_per_second_per_gpu": 16891.15, "total_tokens": 383935675 }, { "epoch": 0.2429982495623906, "grad_norm": 0.9243149161338806, "learning_rate": 2e-05, "loss": 0.6968, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3887, "tokens_per_second_per_gpu": 16992.35, "total_tokens": 384031709 }, { "epoch": 0.24306076519129782, "grad_norm": 0.9099230766296387, "learning_rate": 2e-05, "loss": 0.6686, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3888, "tokens_per_second_per_gpu": 18073.67, "total_tokens": 384129537 }, { "epoch": 0.24312328082020504, "grad_norm": 0.8987873792648315, "learning_rate": 2e-05, "loss": 0.734, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3889, "tokens_per_second_per_gpu": 17754.49, "total_tokens": 384232768 }, { "epoch": 0.24318579644911228, "grad_norm": 1.119376540184021, "learning_rate": 2e-05, "loss": 0.7675, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3890, "tokens_per_second_per_gpu": 17529.7, "total_tokens": 384332673 }, { "epoch": 0.2432483120780195, "grad_norm": 0.9241470694541931, "learning_rate": 2e-05, "loss": 0.7176, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3891, "tokens_per_second_per_gpu": 17705.23, "total_tokens": 384431120 }, { "epoch": 0.24331082770692672, "grad_norm": 0.9325632452964783, "learning_rate": 2e-05, "loss": 0.7086, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3892, "tokens_per_second_per_gpu": 17436.49, "total_tokens": 384528117 }, { "epoch": 0.24337334333583396, "grad_norm": 0.8721106648445129, "learning_rate": 2e-05, "loss": 0.6376, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3893, "tokens_per_second_per_gpu": 17318.35, "total_tokens": 384629080 }, { "epoch": 0.24343585896474118, "grad_norm": 0.9387005567550659, "learning_rate": 2e-05, "loss": 0.677, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3894, "tokens_per_second_per_gpu": 17293.62, "total_tokens": 384729786 }, { "epoch": 0.24349837459364843, "grad_norm": 0.9045098423957825, "learning_rate": 2e-05, "loss": 0.6673, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3895, "tokens_per_second_per_gpu": 17426.35, "total_tokens": 384827136 }, { "epoch": 0.24356089022255564, "grad_norm": 0.8943234086036682, "learning_rate": 2e-05, "loss": 0.6785, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3896, "tokens_per_second_per_gpu": 16942.5, "total_tokens": 384926974 }, { "epoch": 0.24362340585146286, "grad_norm": 0.9946145415306091, "learning_rate": 2e-05, "loss": 0.7046, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3897, "tokens_per_second_per_gpu": 17174.65, "total_tokens": 385024152 }, { "epoch": 0.2436859214803701, "grad_norm": 0.9153541326522827, "learning_rate": 2e-05, "loss": 0.7073, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3898, "tokens_per_second_per_gpu": 18165.37, "total_tokens": 385126149 }, { "epoch": 0.24374843710927732, "grad_norm": 0.8989908695220947, "learning_rate": 2e-05, "loss": 0.6893, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3899, "tokens_per_second_per_gpu": 18188.88, "total_tokens": 385230024 }, { "epoch": 0.24381095273818454, "grad_norm": 0.8807207345962524, "learning_rate": 2e-05, "loss": 0.725, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3900, "tokens_per_second_per_gpu": 17636.88, "total_tokens": 385331534 }, { "epoch": 0.24387346836709178, "grad_norm": 0.9184626340866089, "learning_rate": 2e-05, "loss": 0.6703, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3901, "tokens_per_second_per_gpu": 17948.96, "total_tokens": 385431426 }, { "epoch": 0.243935983995999, "grad_norm": 0.8759437799453735, "learning_rate": 2e-05, "loss": 0.6582, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3902, "tokens_per_second_per_gpu": 17620.31, "total_tokens": 385532535 }, { "epoch": 0.24399849962490622, "grad_norm": 0.915147602558136, "learning_rate": 2e-05, "loss": 0.7291, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3903, "tokens_per_second_per_gpu": 16915.83, "total_tokens": 385632928 }, { "epoch": 0.24406101525381346, "grad_norm": 0.8944460153579712, "learning_rate": 2e-05, "loss": 0.6565, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3904, "tokens_per_second_per_gpu": 17030.28, "total_tokens": 385730112 }, { "epoch": 0.24412353088272068, "grad_norm": 0.9101940393447876, "learning_rate": 2e-05, "loss": 0.6925, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3905, "tokens_per_second_per_gpu": 17970.86, "total_tokens": 385832087 }, { "epoch": 0.2441860465116279, "grad_norm": 1.836445689201355, "learning_rate": 2e-05, "loss": 0.6192, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3906, "tokens_per_second_per_gpu": 15711.26, "total_tokens": 385927318 }, { "epoch": 0.24424856214053514, "grad_norm": 0.8642868995666504, "learning_rate": 2e-05, "loss": 0.6527, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3907, "tokens_per_second_per_gpu": 17740.16, "total_tokens": 386028424 }, { "epoch": 0.24431107776944236, "grad_norm": 0.9241451025009155, "learning_rate": 2e-05, "loss": 0.682, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3908, "tokens_per_second_per_gpu": 16755.6, "total_tokens": 386125528 }, { "epoch": 0.24437359339834958, "grad_norm": 0.9142568111419678, "learning_rate": 2e-05, "loss": 0.658, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3909, "tokens_per_second_per_gpu": 16196.96, "total_tokens": 386221119 }, { "epoch": 0.24443610902725682, "grad_norm": 0.9345428943634033, "learning_rate": 2e-05, "loss": 0.6857, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3910, "tokens_per_second_per_gpu": 17620.45, "total_tokens": 386318774 }, { "epoch": 0.24449862465616404, "grad_norm": 0.9449105262756348, "learning_rate": 2e-05, "loss": 0.7073, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3911, "tokens_per_second_per_gpu": 17472.64, "total_tokens": 386417189 }, { "epoch": 0.24456114028507125, "grad_norm": 0.9108220934867859, "learning_rate": 2e-05, "loss": 0.6998, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3912, "tokens_per_second_per_gpu": 17125.79, "total_tokens": 386516078 }, { "epoch": 0.2446236559139785, "grad_norm": 0.9026102423667908, "learning_rate": 2e-05, "loss": 0.6818, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3913, "tokens_per_second_per_gpu": 17268.5, "total_tokens": 386615749 }, { "epoch": 0.24468617154288572, "grad_norm": 0.9549968838691711, "learning_rate": 2e-05, "loss": 0.7418, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3914, "tokens_per_second_per_gpu": 18213.21, "total_tokens": 386721096 }, { "epoch": 0.24474868717179296, "grad_norm": 0.9311747550964355, "learning_rate": 2e-05, "loss": 0.7366, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3915, "tokens_per_second_per_gpu": 18182.86, "total_tokens": 386821941 }, { "epoch": 0.24481120280070018, "grad_norm": 0.9068348407745361, "learning_rate": 2e-05, "loss": 0.6982, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3916, "tokens_per_second_per_gpu": 17326.11, "total_tokens": 386922831 }, { "epoch": 0.2448737184296074, "grad_norm": 0.8929295539855957, "learning_rate": 2e-05, "loss": 0.7026, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3917, "tokens_per_second_per_gpu": 17855.96, "total_tokens": 387021342 }, { "epoch": 0.24493623405851464, "grad_norm": 0.9228713512420654, "learning_rate": 2e-05, "loss": 0.7132, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3918, "tokens_per_second_per_gpu": 17261.48, "total_tokens": 387119113 }, { "epoch": 0.24499874968742186, "grad_norm": 0.9175050258636475, "learning_rate": 2e-05, "loss": 0.6895, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3919, "tokens_per_second_per_gpu": 17566.17, "total_tokens": 387217218 }, { "epoch": 0.24506126531632907, "grad_norm": 0.9207726120948792, "learning_rate": 2e-05, "loss": 0.6745, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3920, "tokens_per_second_per_gpu": 15915.87, "total_tokens": 387312837 }, { "epoch": 0.24512378094523632, "grad_norm": 0.9226760864257812, "learning_rate": 2e-05, "loss": 0.6607, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3921, "tokens_per_second_per_gpu": 16760.99, "total_tokens": 387413251 }, { "epoch": 0.24518629657414354, "grad_norm": 0.9085695743560791, "learning_rate": 2e-05, "loss": 0.6683, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3922, "tokens_per_second_per_gpu": 17087.05, "total_tokens": 387511095 }, { "epoch": 0.24524881220305075, "grad_norm": 0.8865258097648621, "learning_rate": 2e-05, "loss": 0.6907, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3923, "tokens_per_second_per_gpu": 17953.74, "total_tokens": 387611494 }, { "epoch": 0.245311327831958, "grad_norm": 0.9170107841491699, "learning_rate": 2e-05, "loss": 0.7026, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3924, "tokens_per_second_per_gpu": 17105.58, "total_tokens": 387712294 }, { "epoch": 0.24537384346086522, "grad_norm": 0.9063679575920105, "learning_rate": 2e-05, "loss": 0.7037, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3925, "tokens_per_second_per_gpu": 18535.54, "total_tokens": 387814280 }, { "epoch": 0.24543635908977243, "grad_norm": 0.8934570550918579, "learning_rate": 2e-05, "loss": 0.6815, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3926, "tokens_per_second_per_gpu": 16643.91, "total_tokens": 387910993 }, { "epoch": 0.24549887471867968, "grad_norm": 0.9597575664520264, "learning_rate": 2e-05, "loss": 0.7058, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3927, "tokens_per_second_per_gpu": 18002.33, "total_tokens": 388012179 }, { "epoch": 0.2455613903475869, "grad_norm": 0.9496362209320068, "learning_rate": 2e-05, "loss": 0.6924, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3928, "tokens_per_second_per_gpu": 17119.51, "total_tokens": 388108501 }, { "epoch": 0.2456239059764941, "grad_norm": 0.915999710559845, "learning_rate": 2e-05, "loss": 0.7084, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3929, "tokens_per_second_per_gpu": 17938.39, "total_tokens": 388208610 }, { "epoch": 0.24568642160540136, "grad_norm": 0.9182618856430054, "learning_rate": 2e-05, "loss": 0.7039, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3930, "tokens_per_second_per_gpu": 17267.75, "total_tokens": 388311984 }, { "epoch": 0.24574893723430857, "grad_norm": 0.9132155179977417, "learning_rate": 2e-05, "loss": 0.6883, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3931, "tokens_per_second_per_gpu": 16825.61, "total_tokens": 388409463 }, { "epoch": 0.2458114528632158, "grad_norm": 0.910070538520813, "learning_rate": 2e-05, "loss": 0.6935, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3932, "tokens_per_second_per_gpu": 18264.41, "total_tokens": 388507912 }, { "epoch": 0.24587396849212304, "grad_norm": 0.9242305755615234, "learning_rate": 2e-05, "loss": 0.708, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3933, "tokens_per_second_per_gpu": 17730.15, "total_tokens": 388608758 }, { "epoch": 0.24593648412103025, "grad_norm": 0.9120533466339111, "learning_rate": 2e-05, "loss": 0.6861, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3934, "tokens_per_second_per_gpu": 17155.95, "total_tokens": 388707984 }, { "epoch": 0.2459989997499375, "grad_norm": 0.9197208881378174, "learning_rate": 2e-05, "loss": 0.6918, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3935, "tokens_per_second_per_gpu": 16232.46, "total_tokens": 388805188 }, { "epoch": 0.24606151537884471, "grad_norm": 0.9185624122619629, "learning_rate": 2e-05, "loss": 0.7269, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3936, "tokens_per_second_per_gpu": 17143.32, "total_tokens": 388903260 }, { "epoch": 0.24612403100775193, "grad_norm": 0.91655033826828, "learning_rate": 2e-05, "loss": 0.6886, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3937, "tokens_per_second_per_gpu": 16150.84, "total_tokens": 389000986 }, { "epoch": 0.24618654663665918, "grad_norm": 0.8848457932472229, "learning_rate": 2e-05, "loss": 0.6609, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3938, "tokens_per_second_per_gpu": 16298.36, "total_tokens": 389095908 }, { "epoch": 0.2462490622655664, "grad_norm": 0.9035855531692505, "learning_rate": 2e-05, "loss": 0.7055, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3939, "tokens_per_second_per_gpu": 17471.33, "total_tokens": 389195085 }, { "epoch": 0.2463115778944736, "grad_norm": 0.8966021537780762, "learning_rate": 2e-05, "loss": 0.7145, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3940, "tokens_per_second_per_gpu": 16727.71, "total_tokens": 389293072 }, { "epoch": 0.24637409352338085, "grad_norm": 0.9176414608955383, "learning_rate": 2e-05, "loss": 0.6973, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3941, "tokens_per_second_per_gpu": 18096.58, "total_tokens": 389395063 }, { "epoch": 0.24643660915228807, "grad_norm": 0.9273091554641724, "learning_rate": 2e-05, "loss": 0.7057, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3942, "tokens_per_second_per_gpu": 18592.4, "total_tokens": 389493750 }, { "epoch": 0.2464991247811953, "grad_norm": 0.9023796916007996, "learning_rate": 2e-05, "loss": 0.7049, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3943, "tokens_per_second_per_gpu": 17994.91, "total_tokens": 389592962 }, { "epoch": 0.24656164041010253, "grad_norm": 0.9323318004608154, "learning_rate": 2e-05, "loss": 0.7138, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3944, "tokens_per_second_per_gpu": 17607.34, "total_tokens": 389693301 }, { "epoch": 0.24662415603900975, "grad_norm": 0.9373779296875, "learning_rate": 2e-05, "loss": 0.6944, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3945, "tokens_per_second_per_gpu": 17643.74, "total_tokens": 389791730 }, { "epoch": 0.24668667166791697, "grad_norm": 0.8955006003379822, "learning_rate": 2e-05, "loss": 0.6393, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3946, "tokens_per_second_per_gpu": 17140.43, "total_tokens": 389886170 }, { "epoch": 0.2467491872968242, "grad_norm": 0.9204674363136292, "learning_rate": 2e-05, "loss": 0.7112, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3947, "tokens_per_second_per_gpu": 17055.54, "total_tokens": 389981016 }, { "epoch": 0.24681170292573143, "grad_norm": 0.8819443583488464, "learning_rate": 2e-05, "loss": 0.7133, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3948, "tokens_per_second_per_gpu": 17171.42, "total_tokens": 390079054 }, { "epoch": 0.24687421855463865, "grad_norm": 0.896485447883606, "learning_rate": 2e-05, "loss": 0.6902, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3949, "tokens_per_second_per_gpu": 16673.81, "total_tokens": 390177228 }, { "epoch": 0.2469367341835459, "grad_norm": 0.871767520904541, "learning_rate": 2e-05, "loss": 0.6504, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3950, "tokens_per_second_per_gpu": 16155.7, "total_tokens": 390273632 }, { "epoch": 0.2469992498124531, "grad_norm": 0.923757016658783, "learning_rate": 2e-05, "loss": 0.6831, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3951, "tokens_per_second_per_gpu": 16673.03, "total_tokens": 390367505 }, { "epoch": 0.24706176544136035, "grad_norm": 0.9565576910972595, "learning_rate": 2e-05, "loss": 0.7233, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3952, "tokens_per_second_per_gpu": 16538.96, "total_tokens": 390460629 }, { "epoch": 0.24712428107026757, "grad_norm": 0.9190022349357605, "learning_rate": 2e-05, "loss": 0.6835, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3953, "tokens_per_second_per_gpu": 17360.52, "total_tokens": 390559729 }, { "epoch": 0.2471867966991748, "grad_norm": 0.9008317589759827, "learning_rate": 2e-05, "loss": 0.6648, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3954, "tokens_per_second_per_gpu": 16584.22, "total_tokens": 390656471 }, { "epoch": 0.24724931232808203, "grad_norm": 0.9147392511367798, "learning_rate": 2e-05, "loss": 0.6807, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3955, "tokens_per_second_per_gpu": 16907.67, "total_tokens": 390757237 }, { "epoch": 0.24731182795698925, "grad_norm": 0.9063700437545776, "learning_rate": 2e-05, "loss": 0.706, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3956, "tokens_per_second_per_gpu": 17827.0, "total_tokens": 390854321 }, { "epoch": 0.24737434358589647, "grad_norm": 0.9345539808273315, "learning_rate": 2e-05, "loss": 0.7077, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3957, "tokens_per_second_per_gpu": 16999.06, "total_tokens": 390952437 }, { "epoch": 0.2474368592148037, "grad_norm": 0.9479132890701294, "learning_rate": 2e-05, "loss": 0.6851, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3958, "tokens_per_second_per_gpu": 15904.33, "total_tokens": 391044828 }, { "epoch": 0.24749937484371093, "grad_norm": 0.9059312343597412, "learning_rate": 2e-05, "loss": 0.6795, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3959, "tokens_per_second_per_gpu": 18733.19, "total_tokens": 391146819 }, { "epoch": 0.24756189047261815, "grad_norm": 0.9893911480903625, "learning_rate": 2e-05, "loss": 0.6922, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3960, "tokens_per_second_per_gpu": 16928.82, "total_tokens": 391242346 }, { "epoch": 0.2476244061015254, "grad_norm": 0.9323728680610657, "learning_rate": 2e-05, "loss": 0.7082, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3961, "tokens_per_second_per_gpu": 16018.74, "total_tokens": 391340097 }, { "epoch": 0.2476869217304326, "grad_norm": 0.9617939591407776, "learning_rate": 2e-05, "loss": 0.7199, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3962, "tokens_per_second_per_gpu": 16957.88, "total_tokens": 391439904 }, { "epoch": 0.24774943735933982, "grad_norm": 0.9383141994476318, "learning_rate": 2e-05, "loss": 0.6738, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3963, "tokens_per_second_per_gpu": 16122.46, "total_tokens": 391536384 }, { "epoch": 0.24781195298824707, "grad_norm": 0.9465631246566772, "learning_rate": 2e-05, "loss": 0.6713, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3964, "tokens_per_second_per_gpu": 16883.57, "total_tokens": 391633971 }, { "epoch": 0.2478744686171543, "grad_norm": 0.9001397490501404, "learning_rate": 2e-05, "loss": 0.6851, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3965, "tokens_per_second_per_gpu": 18167.67, "total_tokens": 391737975 }, { "epoch": 0.2479369842460615, "grad_norm": 0.9148541688919067, "learning_rate": 2e-05, "loss": 0.6583, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3966, "tokens_per_second_per_gpu": 16788.7, "total_tokens": 391833843 }, { "epoch": 0.24799949987496875, "grad_norm": 0.9452284574508667, "learning_rate": 2e-05, "loss": 0.665, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3967, "tokens_per_second_per_gpu": 15466.79, "total_tokens": 391926004 }, { "epoch": 0.24806201550387597, "grad_norm": 0.9130239486694336, "learning_rate": 2e-05, "loss": 0.6906, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3968, "tokens_per_second_per_gpu": 17509.15, "total_tokens": 392026007 }, { "epoch": 0.24812453113278318, "grad_norm": 0.9158395528793335, "learning_rate": 2e-05, "loss": 0.677, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3969, "tokens_per_second_per_gpu": 16631.1, "total_tokens": 392122605 }, { "epoch": 0.24818704676169043, "grad_norm": 0.8990854620933533, "learning_rate": 2e-05, "loss": 0.6693, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3970, "tokens_per_second_per_gpu": 16944.55, "total_tokens": 392219605 }, { "epoch": 0.24824956239059764, "grad_norm": 0.9680609107017517, "learning_rate": 2e-05, "loss": 0.6712, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3971, "tokens_per_second_per_gpu": 16804.19, "total_tokens": 392319754 }, { "epoch": 0.2483120780195049, "grad_norm": 0.9767401218414307, "learning_rate": 2e-05, "loss": 0.7092, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3972, "tokens_per_second_per_gpu": 16605.79, "total_tokens": 392417567 }, { "epoch": 0.2483745936484121, "grad_norm": 0.9582995176315308, "learning_rate": 2e-05, "loss": 0.7285, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3973, "tokens_per_second_per_gpu": 18832.35, "total_tokens": 392519604 }, { "epoch": 0.24843710927731932, "grad_norm": 0.9291030168533325, "learning_rate": 2e-05, "loss": 0.712, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3974, "tokens_per_second_per_gpu": 17077.34, "total_tokens": 392620855 }, { "epoch": 0.24849962490622657, "grad_norm": 0.8891448974609375, "learning_rate": 2e-05, "loss": 0.6836, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3975, "tokens_per_second_per_gpu": 16805.61, "total_tokens": 392717457 }, { "epoch": 0.24856214053513379, "grad_norm": 0.9162697792053223, "learning_rate": 2e-05, "loss": 0.679, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3976, "tokens_per_second_per_gpu": 18430.1, "total_tokens": 392814870 }, { "epoch": 0.248624656164041, "grad_norm": 0.8713641166687012, "learning_rate": 2e-05, "loss": 0.6594, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3977, "tokens_per_second_per_gpu": 18473.83, "total_tokens": 392915553 }, { "epoch": 0.24868717179294825, "grad_norm": 0.907451331615448, "learning_rate": 2e-05, "loss": 0.6979, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3978, "tokens_per_second_per_gpu": 16430.12, "total_tokens": 393014141 }, { "epoch": 0.24874968742185546, "grad_norm": 0.9232596158981323, "learning_rate": 2e-05, "loss": 0.7248, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3979, "tokens_per_second_per_gpu": 17940.42, "total_tokens": 393111175 }, { "epoch": 0.24881220305076268, "grad_norm": 0.9452414512634277, "learning_rate": 2e-05, "loss": 0.691, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3980, "tokens_per_second_per_gpu": 17083.04, "total_tokens": 393208627 }, { "epoch": 0.24887471867966993, "grad_norm": 0.9032979011535645, "learning_rate": 2e-05, "loss": 0.6895, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3981, "tokens_per_second_per_gpu": 17324.52, "total_tokens": 393308172 }, { "epoch": 0.24893723430857714, "grad_norm": 0.9085468649864197, "learning_rate": 2e-05, "loss": 0.6563, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3982, "tokens_per_second_per_gpu": 17658.96, "total_tokens": 393407478 }, { "epoch": 0.24899974993748436, "grad_norm": 0.931682288646698, "learning_rate": 2e-05, "loss": 0.683, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3983, "tokens_per_second_per_gpu": 17685.85, "total_tokens": 393501575 }, { "epoch": 0.2490622655663916, "grad_norm": 0.9106594324111938, "learning_rate": 2e-05, "loss": 0.7146, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3984, "tokens_per_second_per_gpu": 18552.62, "total_tokens": 393601329 }, { "epoch": 0.24912478119529882, "grad_norm": 0.9391430020332336, "learning_rate": 2e-05, "loss": 0.6935, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3985, "tokens_per_second_per_gpu": 15585.18, "total_tokens": 393692923 }, { "epoch": 0.24918729682420604, "grad_norm": 0.8886687755584717, "learning_rate": 2e-05, "loss": 0.6914, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3986, "tokens_per_second_per_gpu": 17815.83, "total_tokens": 393792498 }, { "epoch": 0.24924981245311328, "grad_norm": 0.9017617106437683, "learning_rate": 2e-05, "loss": 0.655, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3987, "tokens_per_second_per_gpu": 17200.46, "total_tokens": 393890571 }, { "epoch": 0.2493123280820205, "grad_norm": 0.9184533357620239, "learning_rate": 2e-05, "loss": 0.7181, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3988, "tokens_per_second_per_gpu": 17365.52, "total_tokens": 393992061 }, { "epoch": 0.24937484371092772, "grad_norm": 0.9222050905227661, "learning_rate": 2e-05, "loss": 0.7203, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3989, "tokens_per_second_per_gpu": 16736.43, "total_tokens": 394088574 }, { "epoch": 0.24943735933983496, "grad_norm": 0.9136229157447815, "learning_rate": 2e-05, "loss": 0.7204, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3990, "tokens_per_second_per_gpu": 16529.18, "total_tokens": 394187920 }, { "epoch": 0.24949987496874218, "grad_norm": 0.8973182439804077, "learning_rate": 2e-05, "loss": 0.6664, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3991, "tokens_per_second_per_gpu": 17896.81, "total_tokens": 394287707 }, { "epoch": 0.24956239059764943, "grad_norm": 0.9032609462738037, "learning_rate": 2e-05, "loss": 0.6993, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3992, "tokens_per_second_per_gpu": 17518.99, "total_tokens": 394386974 }, { "epoch": 0.24962490622655664, "grad_norm": 0.9643526077270508, "learning_rate": 2e-05, "loss": 0.6783, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3993, "tokens_per_second_per_gpu": 17693.49, "total_tokens": 394482076 }, { "epoch": 0.24968742185546386, "grad_norm": 0.9212192296981812, "learning_rate": 2e-05, "loss": 0.6625, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3994, "tokens_per_second_per_gpu": 16388.93, "total_tokens": 394575644 }, { "epoch": 0.2497499374843711, "grad_norm": 0.8717874884605408, "learning_rate": 2e-05, "loss": 0.6858, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3995, "tokens_per_second_per_gpu": 17823.43, "total_tokens": 394678414 }, { "epoch": 0.24981245311327832, "grad_norm": 0.9028378129005432, "learning_rate": 2e-05, "loss": 0.7264, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3996, "tokens_per_second_per_gpu": 18066.71, "total_tokens": 394779483 }, { "epoch": 0.24987496874218554, "grad_norm": 0.9052352905273438, "learning_rate": 2e-05, "loss": 0.6545, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3997, "tokens_per_second_per_gpu": 17282.63, "total_tokens": 394879087 }, { "epoch": 0.24993748437109278, "grad_norm": 0.8924785256385803, "learning_rate": 2e-05, "loss": 0.7063, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3998, "tokens_per_second_per_gpu": 17185.28, "total_tokens": 394980369 }, { "epoch": 0.25, "grad_norm": 0.8788015246391296, "learning_rate": 2e-05, "loss": 0.6696, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 3999, "tokens_per_second_per_gpu": 17245.62, "total_tokens": 395080553 }, { "epoch": 0.25006251562890724, "grad_norm": 0.8824039697647095, "learning_rate": 2e-05, "loss": 0.704, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4000, "tokens_per_second_per_gpu": 17421.61, "total_tokens": 395182105 }, { "epoch": 0.25012503125781443, "grad_norm": 0.8768668174743652, "learning_rate": 2e-05, "loss": 0.6831, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4001, "tokens_per_second_per_gpu": 18014.16, "total_tokens": 395284704 }, { "epoch": 0.2501875468867217, "grad_norm": 0.8853370547294617, "learning_rate": 2e-05, "loss": 0.6676, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4002, "tokens_per_second_per_gpu": 16722.76, "total_tokens": 395381440 }, { "epoch": 0.2502500625156289, "grad_norm": 1.0102813243865967, "learning_rate": 2e-05, "loss": 0.6784, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4003, "tokens_per_second_per_gpu": 16442.93, "total_tokens": 395477552 }, { "epoch": 0.2503125781445361, "grad_norm": 0.9428364038467407, "learning_rate": 2e-05, "loss": 0.7151, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4004, "tokens_per_second_per_gpu": 17391.45, "total_tokens": 395579323 }, { "epoch": 0.25037509377344336, "grad_norm": 0.8840007185935974, "learning_rate": 2e-05, "loss": 0.7333, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4005, "tokens_per_second_per_gpu": 18331.82, "total_tokens": 395684787 }, { "epoch": 0.2504376094023506, "grad_norm": 1.0286078453063965, "learning_rate": 2e-05, "loss": 0.6975, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4006, "tokens_per_second_per_gpu": 17100.44, "total_tokens": 395780308 }, { "epoch": 0.2505001250312578, "grad_norm": 0.9627780914306641, "learning_rate": 2e-05, "loss": 0.6975, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4007, "tokens_per_second_per_gpu": 17481.68, "total_tokens": 395881353 }, { "epoch": 0.25056264066016504, "grad_norm": 0.9521011114120483, "learning_rate": 2e-05, "loss": 0.6813, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4008, "tokens_per_second_per_gpu": 17121.16, "total_tokens": 395975527 }, { "epoch": 0.2506251562890723, "grad_norm": 0.8959434628486633, "learning_rate": 2e-05, "loss": 0.6879, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4009, "tokens_per_second_per_gpu": 16870.35, "total_tokens": 396074208 }, { "epoch": 0.25068767191797947, "grad_norm": 0.8610255718231201, "learning_rate": 2e-05, "loss": 0.6416, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4010, "tokens_per_second_per_gpu": 17216.46, "total_tokens": 396172820 }, { "epoch": 0.2507501875468867, "grad_norm": 0.8995857238769531, "learning_rate": 2e-05, "loss": 0.6711, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4011, "tokens_per_second_per_gpu": 17364.73, "total_tokens": 396268795 }, { "epoch": 0.25081270317579396, "grad_norm": 0.9109286069869995, "learning_rate": 2e-05, "loss": 0.7112, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4012, "tokens_per_second_per_gpu": 17225.22, "total_tokens": 396365454 }, { "epoch": 0.25087521880470115, "grad_norm": 0.9272668361663818, "learning_rate": 2e-05, "loss": 0.683, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4013, "tokens_per_second_per_gpu": 16596.63, "total_tokens": 396460505 }, { "epoch": 0.2509377344336084, "grad_norm": 0.8549731969833374, "learning_rate": 2e-05, "loss": 0.6595, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4014, "tokens_per_second_per_gpu": 17170.3, "total_tokens": 396561065 }, { "epoch": 0.25100025006251564, "grad_norm": 0.907321035861969, "learning_rate": 2e-05, "loss": 0.7052, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4015, "tokens_per_second_per_gpu": 17960.26, "total_tokens": 396657595 }, { "epoch": 0.25106276569142283, "grad_norm": 0.9229219555854797, "learning_rate": 2e-05, "loss": 0.6863, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4016, "tokens_per_second_per_gpu": 16966.52, "total_tokens": 396752588 }, { "epoch": 0.2511252813203301, "grad_norm": 0.9344457387924194, "learning_rate": 2e-05, "loss": 0.7352, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4017, "tokens_per_second_per_gpu": 17585.12, "total_tokens": 396852807 }, { "epoch": 0.2511877969492373, "grad_norm": 0.8989630937576294, "learning_rate": 2e-05, "loss": 0.695, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4018, "tokens_per_second_per_gpu": 17806.7, "total_tokens": 396954880 }, { "epoch": 0.25125031257814456, "grad_norm": 0.92384934425354, "learning_rate": 2e-05, "loss": 0.6675, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4019, "tokens_per_second_per_gpu": 17144.16, "total_tokens": 397054974 }, { "epoch": 0.25131282820705175, "grad_norm": 0.9180188775062561, "learning_rate": 2e-05, "loss": 0.6847, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4020, "tokens_per_second_per_gpu": 16464.13, "total_tokens": 397149391 }, { "epoch": 0.251375343835959, "grad_norm": 0.9585952162742615, "learning_rate": 2e-05, "loss": 0.7085, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4021, "tokens_per_second_per_gpu": 15352.83, "total_tokens": 397240876 }, { "epoch": 0.25143785946486624, "grad_norm": 0.859204888343811, "learning_rate": 2e-05, "loss": 0.6571, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4022, "tokens_per_second_per_gpu": 17420.57, "total_tokens": 397342385 }, { "epoch": 0.25150037509377343, "grad_norm": 0.8893321752548218, "learning_rate": 2e-05, "loss": 0.7173, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4023, "tokens_per_second_per_gpu": 18022.53, "total_tokens": 397444709 }, { "epoch": 0.2515628907226807, "grad_norm": 0.909244179725647, "learning_rate": 2e-05, "loss": 0.7034, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4024, "tokens_per_second_per_gpu": 16149.16, "total_tokens": 397538089 }, { "epoch": 0.2516254063515879, "grad_norm": 0.9326635003089905, "learning_rate": 2e-05, "loss": 0.7491, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4025, "tokens_per_second_per_gpu": 17402.86, "total_tokens": 397634959 }, { "epoch": 0.2516879219804951, "grad_norm": 0.8697532415390015, "learning_rate": 2e-05, "loss": 0.7114, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4026, "tokens_per_second_per_gpu": 17380.95, "total_tokens": 397734037 }, { "epoch": 0.25175043760940236, "grad_norm": 0.9171710014343262, "learning_rate": 2e-05, "loss": 0.7774, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4027, "tokens_per_second_per_gpu": 18262.71, "total_tokens": 397835922 }, { "epoch": 0.2518129532383096, "grad_norm": 0.928442120552063, "learning_rate": 2e-05, "loss": 0.7327, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4028, "tokens_per_second_per_gpu": 17025.93, "total_tokens": 397935104 }, { "epoch": 0.2518754688672168, "grad_norm": 0.8913078904151917, "learning_rate": 2e-05, "loss": 0.6904, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4029, "tokens_per_second_per_gpu": 17312.43, "total_tokens": 398033061 }, { "epoch": 0.25193798449612403, "grad_norm": 0.9106675386428833, "learning_rate": 2e-05, "loss": 0.6675, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4030, "tokens_per_second_per_gpu": 16987.06, "total_tokens": 398128663 }, { "epoch": 0.2520005001250313, "grad_norm": 0.8962734937667847, "learning_rate": 2e-05, "loss": 0.677, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4031, "tokens_per_second_per_gpu": 17590.42, "total_tokens": 398226730 }, { "epoch": 0.25206301575393847, "grad_norm": 0.8750914931297302, "learning_rate": 2e-05, "loss": 0.6808, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4032, "tokens_per_second_per_gpu": 18643.58, "total_tokens": 398328082 }, { "epoch": 0.2521255313828457, "grad_norm": 0.8986339569091797, "learning_rate": 2e-05, "loss": 0.6976, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4033, "tokens_per_second_per_gpu": 17606.93, "total_tokens": 398426264 }, { "epoch": 0.25218804701175296, "grad_norm": 0.9081829786300659, "learning_rate": 2e-05, "loss": 0.6863, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4034, "tokens_per_second_per_gpu": 18474.86, "total_tokens": 398527907 }, { "epoch": 0.25225056264066015, "grad_norm": 0.9277908205986023, "learning_rate": 2e-05, "loss": 0.7322, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4035, "tokens_per_second_per_gpu": 17863.06, "total_tokens": 398627681 }, { "epoch": 0.2523130782695674, "grad_norm": 0.9688423275947571, "learning_rate": 2e-05, "loss": 0.71, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4036, "tokens_per_second_per_gpu": 17652.08, "total_tokens": 398724215 }, { "epoch": 0.25237559389847464, "grad_norm": 0.8742895722389221, "learning_rate": 2e-05, "loss": 0.7023, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4037, "tokens_per_second_per_gpu": 18479.63, "total_tokens": 398828403 }, { "epoch": 0.2524381095273818, "grad_norm": 0.8974713683128357, "learning_rate": 2e-05, "loss": 0.6739, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4038, "tokens_per_second_per_gpu": 17332.69, "total_tokens": 398927803 }, { "epoch": 0.25250062515628907, "grad_norm": 0.9143415689468384, "learning_rate": 2e-05, "loss": 0.6833, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4039, "tokens_per_second_per_gpu": 17790.7, "total_tokens": 399025648 }, { "epoch": 0.2525631407851963, "grad_norm": 0.9778472781181335, "learning_rate": 2e-05, "loss": 0.7248, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4040, "tokens_per_second_per_gpu": 19340.1, "total_tokens": 399128214 }, { "epoch": 0.2526256564141035, "grad_norm": 0.9135240912437439, "learning_rate": 2e-05, "loss": 0.6861, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4041, "tokens_per_second_per_gpu": 16903.21, "total_tokens": 399223930 }, { "epoch": 0.25268817204301075, "grad_norm": 0.9267925024032593, "learning_rate": 2e-05, "loss": 0.6731, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4042, "tokens_per_second_per_gpu": 18167.46, "total_tokens": 399324109 }, { "epoch": 0.252750687671918, "grad_norm": 0.903289794921875, "learning_rate": 2e-05, "loss": 0.6966, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4043, "tokens_per_second_per_gpu": 17247.03, "total_tokens": 399422232 }, { "epoch": 0.2528132033008252, "grad_norm": 0.9452382922172546, "learning_rate": 2e-05, "loss": 0.7299, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4044, "tokens_per_second_per_gpu": 17888.28, "total_tokens": 399519394 }, { "epoch": 0.25287571892973243, "grad_norm": 0.9656436443328857, "learning_rate": 2e-05, "loss": 0.6857, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4045, "tokens_per_second_per_gpu": 15217.92, "total_tokens": 399607231 }, { "epoch": 0.2529382345586397, "grad_norm": 0.9227545261383057, "learning_rate": 2e-05, "loss": 0.6795, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4046, "tokens_per_second_per_gpu": 17498.89, "total_tokens": 399704444 }, { "epoch": 0.25300075018754686, "grad_norm": 0.9064651131629944, "learning_rate": 2e-05, "loss": 0.679, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4047, "tokens_per_second_per_gpu": 17519.21, "total_tokens": 399802379 }, { "epoch": 0.2530632658164541, "grad_norm": 0.9365736246109009, "learning_rate": 2e-05, "loss": 0.7062, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4048, "tokens_per_second_per_gpu": 16858.55, "total_tokens": 399894789 }, { "epoch": 0.25312578144536135, "grad_norm": 0.9617698788642883, "learning_rate": 2e-05, "loss": 0.7212, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4049, "tokens_per_second_per_gpu": 17099.23, "total_tokens": 399992934 }, { "epoch": 0.25318829707426854, "grad_norm": 0.9618562459945679, "learning_rate": 2e-05, "loss": 0.6929, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4050, "tokens_per_second_per_gpu": 17504.37, "total_tokens": 400092785 }, { "epoch": 0.2532508127031758, "grad_norm": 0.961036741733551, "learning_rate": 2e-05, "loss": 0.695, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4051, "tokens_per_second_per_gpu": 16898.34, "total_tokens": 400190094 }, { "epoch": 0.25331332833208303, "grad_norm": 0.8615933060646057, "learning_rate": 2e-05, "loss": 0.624, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4052, "tokens_per_second_per_gpu": 17422.41, "total_tokens": 400287798 }, { "epoch": 0.2533758439609902, "grad_norm": 0.9124388694763184, "learning_rate": 2e-05, "loss": 0.676, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4053, "tokens_per_second_per_gpu": 17960.4, "total_tokens": 400386718 }, { "epoch": 0.25343835958989747, "grad_norm": 0.9082530736923218, "learning_rate": 2e-05, "loss": 0.681, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4054, "tokens_per_second_per_gpu": 18101.53, "total_tokens": 400486068 }, { "epoch": 0.2535008752188047, "grad_norm": 0.9458401799201965, "learning_rate": 2e-05, "loss": 0.7168, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4055, "tokens_per_second_per_gpu": 16829.02, "total_tokens": 400579301 }, { "epoch": 0.2535633908477119, "grad_norm": 0.93709796667099, "learning_rate": 2e-05, "loss": 0.7372, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4056, "tokens_per_second_per_gpu": 17261.81, "total_tokens": 400677084 }, { "epoch": 0.25362590647661915, "grad_norm": 0.8978529572486877, "learning_rate": 2e-05, "loss": 0.6705, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4057, "tokens_per_second_per_gpu": 17892.14, "total_tokens": 400774981 }, { "epoch": 0.2536884221055264, "grad_norm": 0.8906385898590088, "learning_rate": 2e-05, "loss": 0.6611, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4058, "tokens_per_second_per_gpu": 17696.88, "total_tokens": 400872913 }, { "epoch": 0.25375093773443363, "grad_norm": 0.9473224878311157, "learning_rate": 2e-05, "loss": 0.7076, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4059, "tokens_per_second_per_gpu": 16226.19, "total_tokens": 400965634 }, { "epoch": 0.2538134533633408, "grad_norm": 0.9085614085197449, "learning_rate": 2e-05, "loss": 0.6834, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4060, "tokens_per_second_per_gpu": 17676.94, "total_tokens": 401064468 }, { "epoch": 0.25387596899224807, "grad_norm": 0.9195953011512756, "learning_rate": 2e-05, "loss": 0.6538, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4061, "tokens_per_second_per_gpu": 15844.95, "total_tokens": 401156911 }, { "epoch": 0.2539384846211553, "grad_norm": 0.89878910779953, "learning_rate": 2e-05, "loss": 0.6639, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4062, "tokens_per_second_per_gpu": 16412.59, "total_tokens": 401252822 }, { "epoch": 0.2540010002500625, "grad_norm": 0.9465223550796509, "learning_rate": 2e-05, "loss": 0.6955, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4063, "tokens_per_second_per_gpu": 15840.43, "total_tokens": 401344521 }, { "epoch": 0.25406351587896975, "grad_norm": 0.9175329804420471, "learning_rate": 2e-05, "loss": 0.6624, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4064, "tokens_per_second_per_gpu": 17003.92, "total_tokens": 401443291 }, { "epoch": 0.254126031507877, "grad_norm": 0.9380756616592407, "learning_rate": 2e-05, "loss": 0.6637, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4065, "tokens_per_second_per_gpu": 17570.6, "total_tokens": 401542030 }, { "epoch": 0.2541885471367842, "grad_norm": 0.9130515456199646, "learning_rate": 2e-05, "loss": 0.735, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4066, "tokens_per_second_per_gpu": 17366.27, "total_tokens": 401642530 }, { "epoch": 0.2542510627656914, "grad_norm": 0.9171011447906494, "learning_rate": 2e-05, "loss": 0.7342, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4067, "tokens_per_second_per_gpu": 17225.57, "total_tokens": 401740977 }, { "epoch": 0.25431357839459867, "grad_norm": 0.8993602991104126, "learning_rate": 2e-05, "loss": 0.6192, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4068, "tokens_per_second_per_gpu": 16275.65, "total_tokens": 401835230 }, { "epoch": 0.25437609402350586, "grad_norm": 0.9702425599098206, "learning_rate": 2e-05, "loss": 0.7029, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4069, "tokens_per_second_per_gpu": 17220.65, "total_tokens": 401934104 }, { "epoch": 0.2544386096524131, "grad_norm": 0.9326822757720947, "learning_rate": 2e-05, "loss": 0.7078, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4070, "tokens_per_second_per_gpu": 18217.76, "total_tokens": 402033500 }, { "epoch": 0.25450112528132035, "grad_norm": 0.9496045708656311, "learning_rate": 2e-05, "loss": 0.6876, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4071, "tokens_per_second_per_gpu": 17042.68, "total_tokens": 402131038 }, { "epoch": 0.25456364091022754, "grad_norm": 0.8892720937728882, "learning_rate": 2e-05, "loss": 0.6797, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4072, "tokens_per_second_per_gpu": 17160.32, "total_tokens": 402227134 }, { "epoch": 0.2546261565391348, "grad_norm": 0.8508104681968689, "learning_rate": 2e-05, "loss": 0.6755, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4073, "tokens_per_second_per_gpu": 17792.71, "total_tokens": 402327556 }, { "epoch": 0.25468867216804203, "grad_norm": 0.9648539423942566, "learning_rate": 2e-05, "loss": 0.7112, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4074, "tokens_per_second_per_gpu": 18605.07, "total_tokens": 402431332 }, { "epoch": 0.2547511877969492, "grad_norm": 0.9432253241539001, "learning_rate": 2e-05, "loss": 0.6848, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4075, "tokens_per_second_per_gpu": 17368.96, "total_tokens": 402529082 }, { "epoch": 0.25481370342585646, "grad_norm": 0.9393872022628784, "learning_rate": 2e-05, "loss": 0.6876, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4076, "tokens_per_second_per_gpu": 18023.32, "total_tokens": 402626119 }, { "epoch": 0.2548762190547637, "grad_norm": 0.9301141500473022, "learning_rate": 2e-05, "loss": 0.7056, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4077, "tokens_per_second_per_gpu": 17140.6, "total_tokens": 402722476 }, { "epoch": 0.2549387346836709, "grad_norm": 1.0172001123428345, "learning_rate": 2e-05, "loss": 0.6979, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4078, "tokens_per_second_per_gpu": 17096.73, "total_tokens": 402818553 }, { "epoch": 0.25500125031257814, "grad_norm": 0.9607576131820679, "learning_rate": 2e-05, "loss": 0.6925, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4079, "tokens_per_second_per_gpu": 17354.42, "total_tokens": 402917044 }, { "epoch": 0.2550637659414854, "grad_norm": 0.9630777835845947, "learning_rate": 2e-05, "loss": 0.6594, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4080, "tokens_per_second_per_gpu": 16357.8, "total_tokens": 403009836 }, { "epoch": 0.2551262815703926, "grad_norm": 0.9113143086433411, "learning_rate": 2e-05, "loss": 0.6844, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4081, "tokens_per_second_per_gpu": 16155.15, "total_tokens": 403102787 }, { "epoch": 0.2551887971992998, "grad_norm": 1.01018226146698, "learning_rate": 2e-05, "loss": 0.7235, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4082, "tokens_per_second_per_gpu": 17657.74, "total_tokens": 403200201 }, { "epoch": 0.25525131282820707, "grad_norm": 1.0647594928741455, "learning_rate": 2e-05, "loss": 0.6762, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4083, "tokens_per_second_per_gpu": 17173.12, "total_tokens": 403298312 }, { "epoch": 0.25531382845711426, "grad_norm": 1.04371976852417, "learning_rate": 2e-05, "loss": 0.6579, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4084, "tokens_per_second_per_gpu": 17138.61, "total_tokens": 403393077 }, { "epoch": 0.2553763440860215, "grad_norm": 0.9062055945396423, "learning_rate": 2e-05, "loss": 0.6897, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4085, "tokens_per_second_per_gpu": 17298.73, "total_tokens": 403488125 }, { "epoch": 0.25543885971492875, "grad_norm": 0.912011981010437, "learning_rate": 2e-05, "loss": 0.7012, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4086, "tokens_per_second_per_gpu": 17885.12, "total_tokens": 403587152 }, { "epoch": 0.25550137534383593, "grad_norm": 0.9391490817070007, "learning_rate": 2e-05, "loss": 0.6889, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4087, "tokens_per_second_per_gpu": 17674.88, "total_tokens": 403684309 }, { "epoch": 0.2555638909727432, "grad_norm": 1.080936074256897, "learning_rate": 2e-05, "loss": 0.6605, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4088, "tokens_per_second_per_gpu": 17321.31, "total_tokens": 403782992 }, { "epoch": 0.2556264066016504, "grad_norm": 0.9860252141952515, "learning_rate": 2e-05, "loss": 0.6945, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4089, "tokens_per_second_per_gpu": 16647.59, "total_tokens": 403879912 }, { "epoch": 0.2556889222305576, "grad_norm": 0.9915425777435303, "learning_rate": 2e-05, "loss": 0.6666, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4090, "tokens_per_second_per_gpu": 17510.22, "total_tokens": 403976676 }, { "epoch": 0.25575143785946486, "grad_norm": 0.9342162013053894, "learning_rate": 2e-05, "loss": 0.7161, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4091, "tokens_per_second_per_gpu": 17920.39, "total_tokens": 404080165 }, { "epoch": 0.2558139534883721, "grad_norm": 0.9913167357444763, "learning_rate": 2e-05, "loss": 0.6338, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4092, "tokens_per_second_per_gpu": 17479.83, "total_tokens": 404175600 }, { "epoch": 0.2558764691172793, "grad_norm": 0.93712317943573, "learning_rate": 2e-05, "loss": 0.6877, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4093, "tokens_per_second_per_gpu": 17107.97, "total_tokens": 404272063 }, { "epoch": 0.25593898474618654, "grad_norm": 0.8731212019920349, "learning_rate": 2e-05, "loss": 0.7248, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4094, "tokens_per_second_per_gpu": 17954.06, "total_tokens": 404372353 }, { "epoch": 0.2560015003750938, "grad_norm": 0.9049555659294128, "learning_rate": 2e-05, "loss": 0.7271, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4095, "tokens_per_second_per_gpu": 17312.64, "total_tokens": 404469109 }, { "epoch": 0.256064016004001, "grad_norm": 0.9026248455047607, "learning_rate": 2e-05, "loss": 0.7015, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4096, "tokens_per_second_per_gpu": 17688.2, "total_tokens": 404568000 }, { "epoch": 0.2561265316329082, "grad_norm": 0.9850150346755981, "learning_rate": 2e-05, "loss": 0.6813, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4097, "tokens_per_second_per_gpu": 17648.59, "total_tokens": 404667892 }, { "epoch": 0.25618904726181546, "grad_norm": 0.9717612266540527, "learning_rate": 2e-05, "loss": 0.6715, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4098, "tokens_per_second_per_gpu": 17832.58, "total_tokens": 404770797 }, { "epoch": 0.2562515628907227, "grad_norm": 0.8951855897903442, "learning_rate": 2e-05, "loss": 0.7156, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4099, "tokens_per_second_per_gpu": 17295.94, "total_tokens": 404870548 }, { "epoch": 0.2563140785196299, "grad_norm": 0.9550855755805969, "learning_rate": 2e-05, "loss": 0.751, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4100, "tokens_per_second_per_gpu": 17694.34, "total_tokens": 404972179 }, { "epoch": 0.25637659414853714, "grad_norm": 0.9078063368797302, "learning_rate": 2e-05, "loss": 0.7209, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4101, "tokens_per_second_per_gpu": 17868.57, "total_tokens": 405069017 }, { "epoch": 0.2564391097774444, "grad_norm": 0.8810771703720093, "learning_rate": 2e-05, "loss": 0.691, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4102, "tokens_per_second_per_gpu": 16777.02, "total_tokens": 405164943 }, { "epoch": 0.2565016254063516, "grad_norm": 0.9193295240402222, "learning_rate": 2e-05, "loss": 0.68, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4103, "tokens_per_second_per_gpu": 16686.36, "total_tokens": 405263665 }, { "epoch": 0.2565641410352588, "grad_norm": 0.8703629374504089, "learning_rate": 2e-05, "loss": 0.7218, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4104, "tokens_per_second_per_gpu": 17335.27, "total_tokens": 405364994 }, { "epoch": 0.25662665666416606, "grad_norm": 0.8885310292243958, "learning_rate": 2e-05, "loss": 0.6846, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4105, "tokens_per_second_per_gpu": 16765.73, "total_tokens": 405465341 }, { "epoch": 0.25668917229307325, "grad_norm": 0.8994097709655762, "learning_rate": 2e-05, "loss": 0.6941, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4106, "tokens_per_second_per_gpu": 17735.9, "total_tokens": 405564002 }, { "epoch": 0.2567516879219805, "grad_norm": 0.9578073024749756, "learning_rate": 2e-05, "loss": 0.7104, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4107, "tokens_per_second_per_gpu": 17247.18, "total_tokens": 405662434 }, { "epoch": 0.25681420355088774, "grad_norm": 0.8766051530838013, "learning_rate": 2e-05, "loss": 0.6889, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4108, "tokens_per_second_per_gpu": 17324.37, "total_tokens": 405762151 }, { "epoch": 0.25687671917979493, "grad_norm": 0.9300903081893921, "learning_rate": 2e-05, "loss": 0.7178, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4109, "tokens_per_second_per_gpu": 17362.61, "total_tokens": 405859806 }, { "epoch": 0.2569392348087022, "grad_norm": 0.8885295987129211, "learning_rate": 2e-05, "loss": 0.6656, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4110, "tokens_per_second_per_gpu": 16261.43, "total_tokens": 405954506 }, { "epoch": 0.2570017504376094, "grad_norm": 0.9187685251235962, "learning_rate": 2e-05, "loss": 0.6986, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4111, "tokens_per_second_per_gpu": 16505.04, "total_tokens": 406049544 }, { "epoch": 0.2570642660665166, "grad_norm": 0.9422640800476074, "learning_rate": 2e-05, "loss": 0.6801, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4112, "tokens_per_second_per_gpu": 17799.65, "total_tokens": 406151788 }, { "epoch": 0.25712678169542386, "grad_norm": 0.8951693177223206, "learning_rate": 2e-05, "loss": 0.6964, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4113, "tokens_per_second_per_gpu": 16148.66, "total_tokens": 406244652 }, { "epoch": 0.2571892973243311, "grad_norm": 0.9450216293334961, "learning_rate": 2e-05, "loss": 0.7263, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4114, "tokens_per_second_per_gpu": 17522.43, "total_tokens": 406343269 }, { "epoch": 0.2572518129532383, "grad_norm": 0.8778443932533264, "learning_rate": 2e-05, "loss": 0.669, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4115, "tokens_per_second_per_gpu": 16968.52, "total_tokens": 406440156 }, { "epoch": 0.25731432858214554, "grad_norm": 0.9003824591636658, "learning_rate": 2e-05, "loss": 0.6681, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4116, "tokens_per_second_per_gpu": 17349.77, "total_tokens": 406534196 }, { "epoch": 0.2573768442110528, "grad_norm": 0.9331483244895935, "learning_rate": 2e-05, "loss": 0.6772, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4117, "tokens_per_second_per_gpu": 17103.83, "total_tokens": 406634935 }, { "epoch": 0.25743935983995997, "grad_norm": 0.914043664932251, "learning_rate": 2e-05, "loss": 0.6801, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4118, "tokens_per_second_per_gpu": 16769.64, "total_tokens": 406733258 }, { "epoch": 0.2575018754688672, "grad_norm": 0.9236394166946411, "learning_rate": 2e-05, "loss": 0.7078, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4119, "tokens_per_second_per_gpu": 17426.96, "total_tokens": 406830697 }, { "epoch": 0.25756439109777446, "grad_norm": 0.9332876801490784, "learning_rate": 2e-05, "loss": 0.7168, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4120, "tokens_per_second_per_gpu": 18448.45, "total_tokens": 406928121 }, { "epoch": 0.25762690672668165, "grad_norm": 0.9203969836235046, "learning_rate": 2e-05, "loss": 0.691, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4121, "tokens_per_second_per_gpu": 16688.09, "total_tokens": 407024229 }, { "epoch": 0.2576894223555889, "grad_norm": 0.9083579182624817, "learning_rate": 2e-05, "loss": 0.68, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4122, "tokens_per_second_per_gpu": 16668.66, "total_tokens": 407122137 }, { "epoch": 0.25775193798449614, "grad_norm": 0.9412323832511902, "learning_rate": 2e-05, "loss": 0.6693, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4123, "tokens_per_second_per_gpu": 15247.93, "total_tokens": 407213333 }, { "epoch": 0.2578144536134033, "grad_norm": 0.9330040216445923, "learning_rate": 2e-05, "loss": 0.7074, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4124, "tokens_per_second_per_gpu": 17995.71, "total_tokens": 407314783 }, { "epoch": 0.25787696924231057, "grad_norm": 0.9280405640602112, "learning_rate": 2e-05, "loss": 0.7168, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4125, "tokens_per_second_per_gpu": 17280.48, "total_tokens": 407413024 }, { "epoch": 0.2579394848712178, "grad_norm": 0.9024964570999146, "learning_rate": 2e-05, "loss": 0.7334, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4126, "tokens_per_second_per_gpu": 18348.16, "total_tokens": 407518638 }, { "epoch": 0.258002000500125, "grad_norm": 1.0144437551498413, "learning_rate": 2e-05, "loss": 0.7253, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4127, "tokens_per_second_per_gpu": 16677.99, "total_tokens": 407617279 }, { "epoch": 0.25806451612903225, "grad_norm": 0.9055246114730835, "learning_rate": 2e-05, "loss": 0.6578, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4128, "tokens_per_second_per_gpu": 15798.04, "total_tokens": 407709690 }, { "epoch": 0.2581270317579395, "grad_norm": 0.9441051483154297, "learning_rate": 2e-05, "loss": 0.7131, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4129, "tokens_per_second_per_gpu": 16269.59, "total_tokens": 407804265 }, { "epoch": 0.2581895473868467, "grad_norm": 0.9255509972572327, "learning_rate": 2e-05, "loss": 0.7178, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4130, "tokens_per_second_per_gpu": 16470.04, "total_tokens": 407901913 }, { "epoch": 0.25825206301575393, "grad_norm": 0.8952970504760742, "learning_rate": 2e-05, "loss": 0.6774, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4131, "tokens_per_second_per_gpu": 16858.17, "total_tokens": 408000966 }, { "epoch": 0.2583145786446612, "grad_norm": 1.0838285684585571, "learning_rate": 2e-05, "loss": 0.7278, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4132, "tokens_per_second_per_gpu": 16599.63, "total_tokens": 408095077 }, { "epoch": 0.2583770942735684, "grad_norm": 0.9047045707702637, "learning_rate": 2e-05, "loss": 0.6688, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4133, "tokens_per_second_per_gpu": 16893.29, "total_tokens": 408192498 }, { "epoch": 0.2584396099024756, "grad_norm": 0.9117662310600281, "learning_rate": 2e-05, "loss": 0.6345, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4134, "tokens_per_second_per_gpu": 15769.06, "total_tokens": 408286311 }, { "epoch": 0.25850212553138285, "grad_norm": 0.9353184700012207, "learning_rate": 2e-05, "loss": 0.7086, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4135, "tokens_per_second_per_gpu": 17365.85, "total_tokens": 408382168 }, { "epoch": 0.2585646411602901, "grad_norm": 0.979390025138855, "learning_rate": 2e-05, "loss": 0.6898, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4136, "tokens_per_second_per_gpu": 15217.73, "total_tokens": 408472262 }, { "epoch": 0.2586271567891973, "grad_norm": 0.8940108418464661, "learning_rate": 2e-05, "loss": 0.6289, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4137, "tokens_per_second_per_gpu": 16183.24, "total_tokens": 408565736 }, { "epoch": 0.25868967241810453, "grad_norm": 0.9295534491539001, "learning_rate": 2e-05, "loss": 0.7138, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4138, "tokens_per_second_per_gpu": 18151.87, "total_tokens": 408665587 }, { "epoch": 0.2587521880470118, "grad_norm": 0.9624873995780945, "learning_rate": 2e-05, "loss": 0.6854, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4139, "tokens_per_second_per_gpu": 16535.27, "total_tokens": 408758164 }, { "epoch": 0.25881470367591897, "grad_norm": 0.885453999042511, "learning_rate": 2e-05, "loss": 0.6482, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4140, "tokens_per_second_per_gpu": 16471.85, "total_tokens": 408854081 }, { "epoch": 0.2588772193048262, "grad_norm": 0.8930432200431824, "learning_rate": 2e-05, "loss": 0.6617, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4141, "tokens_per_second_per_gpu": 18194.95, "total_tokens": 408952551 }, { "epoch": 0.25893973493373346, "grad_norm": 0.9370989799499512, "learning_rate": 2e-05, "loss": 0.7353, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4142, "tokens_per_second_per_gpu": 16702.58, "total_tokens": 409051246 }, { "epoch": 0.25900225056264065, "grad_norm": 0.9559011459350586, "learning_rate": 2e-05, "loss": 0.7033, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4143, "tokens_per_second_per_gpu": 15853.75, "total_tokens": 409144042 }, { "epoch": 0.2590647661915479, "grad_norm": 0.8930439352989197, "learning_rate": 2e-05, "loss": 0.6448, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4144, "tokens_per_second_per_gpu": 16951.38, "total_tokens": 409239595 }, { "epoch": 0.25912728182045514, "grad_norm": 0.9479560852050781, "learning_rate": 2e-05, "loss": 0.7123, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4145, "tokens_per_second_per_gpu": 15512.94, "total_tokens": 409332789 }, { "epoch": 0.2591897974493623, "grad_norm": 0.9261783957481384, "learning_rate": 2e-05, "loss": 0.7322, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4146, "tokens_per_second_per_gpu": 17601.58, "total_tokens": 409432340 }, { "epoch": 0.25925231307826957, "grad_norm": 0.9422135949134827, "learning_rate": 2e-05, "loss": 0.6594, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4147, "tokens_per_second_per_gpu": 15808.05, "total_tokens": 409526464 }, { "epoch": 0.2593148287071768, "grad_norm": 0.9128454923629761, "learning_rate": 2e-05, "loss": 0.6328, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4148, "tokens_per_second_per_gpu": 15518.12, "total_tokens": 409617844 }, { "epoch": 0.259377344336084, "grad_norm": 0.9338565468788147, "learning_rate": 2e-05, "loss": 0.6992, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4149, "tokens_per_second_per_gpu": 17394.67, "total_tokens": 409713867 }, { "epoch": 0.25943985996499125, "grad_norm": 0.8934237360954285, "learning_rate": 2e-05, "loss": 0.7365, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4150, "tokens_per_second_per_gpu": 17692.62, "total_tokens": 409815496 }, { "epoch": 0.2595023755938985, "grad_norm": 0.9797597527503967, "learning_rate": 2e-05, "loss": 0.679, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4151, "tokens_per_second_per_gpu": 15432.87, "total_tokens": 409906289 }, { "epoch": 0.2595648912228057, "grad_norm": 0.9344868659973145, "learning_rate": 2e-05, "loss": 0.7062, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4152, "tokens_per_second_per_gpu": 16940.99, "total_tokens": 410001921 }, { "epoch": 0.2596274068517129, "grad_norm": 0.9488946199417114, "learning_rate": 2e-05, "loss": 0.716, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4153, "tokens_per_second_per_gpu": 16462.98, "total_tokens": 410101117 }, { "epoch": 0.2596899224806202, "grad_norm": 0.9028172492980957, "learning_rate": 2e-05, "loss": 0.6673, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4154, "tokens_per_second_per_gpu": 17488.89, "total_tokens": 410198486 }, { "epoch": 0.25975243810952736, "grad_norm": 0.9107809066772461, "learning_rate": 2e-05, "loss": 0.6692, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4155, "tokens_per_second_per_gpu": 16130.99, "total_tokens": 410292460 }, { "epoch": 0.2598149537384346, "grad_norm": 0.9549922347068787, "learning_rate": 2e-05, "loss": 0.6853, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4156, "tokens_per_second_per_gpu": 15852.49, "total_tokens": 410385189 }, { "epoch": 0.25987746936734185, "grad_norm": 0.9137200713157654, "learning_rate": 2e-05, "loss": 0.7287, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4157, "tokens_per_second_per_gpu": 17284.63, "total_tokens": 410482360 }, { "epoch": 0.25993998499624904, "grad_norm": 1.0821585655212402, "learning_rate": 2e-05, "loss": 0.6941, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4158, "tokens_per_second_per_gpu": 17185.39, "total_tokens": 410580587 }, { "epoch": 0.2600025006251563, "grad_norm": 0.9388073682785034, "learning_rate": 2e-05, "loss": 0.6898, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4159, "tokens_per_second_per_gpu": 17222.08, "total_tokens": 410679039 }, { "epoch": 0.26006501625406353, "grad_norm": 0.9441477656364441, "learning_rate": 2e-05, "loss": 0.7044, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4160, "tokens_per_second_per_gpu": 17352.63, "total_tokens": 410778080 }, { "epoch": 0.2601275318829707, "grad_norm": 0.9541816711425781, "learning_rate": 2e-05, "loss": 0.7004, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4161, "tokens_per_second_per_gpu": 17274.12, "total_tokens": 410877213 }, { "epoch": 0.26019004751187796, "grad_norm": 0.9182491898536682, "learning_rate": 2e-05, "loss": 0.6732, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4162, "tokens_per_second_per_gpu": 17179.53, "total_tokens": 410974164 }, { "epoch": 0.2602525631407852, "grad_norm": 0.9899705052375793, "learning_rate": 2e-05, "loss": 0.7368, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4163, "tokens_per_second_per_gpu": 17558.32, "total_tokens": 411073914 }, { "epoch": 0.2603150787696924, "grad_norm": 0.9050171375274658, "learning_rate": 2e-05, "loss": 0.698, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4164, "tokens_per_second_per_gpu": 17074.73, "total_tokens": 411174333 }, { "epoch": 0.26037759439859964, "grad_norm": 0.9384649991989136, "learning_rate": 2e-05, "loss": 0.6896, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4165, "tokens_per_second_per_gpu": 17556.77, "total_tokens": 411270655 }, { "epoch": 0.2604401100275069, "grad_norm": 0.9213833808898926, "learning_rate": 2e-05, "loss": 0.6711, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4166, "tokens_per_second_per_gpu": 16435.97, "total_tokens": 411369142 }, { "epoch": 0.2605026256564141, "grad_norm": 0.8984628319740295, "learning_rate": 2e-05, "loss": 0.7036, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4167, "tokens_per_second_per_gpu": 18010.85, "total_tokens": 411471650 }, { "epoch": 0.2605651412853213, "grad_norm": 0.9004685878753662, "learning_rate": 2e-05, "loss": 0.7064, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4168, "tokens_per_second_per_gpu": 17544.03, "total_tokens": 411571614 }, { "epoch": 0.26062765691422857, "grad_norm": 0.8726001381874084, "learning_rate": 2e-05, "loss": 0.669, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4169, "tokens_per_second_per_gpu": 17321.82, "total_tokens": 411674199 }, { "epoch": 0.26069017254313576, "grad_norm": 0.9383422136306763, "learning_rate": 2e-05, "loss": 0.6557, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4170, "tokens_per_second_per_gpu": 15560.9, "total_tokens": 411765760 }, { "epoch": 0.260752688172043, "grad_norm": 0.9703550338745117, "learning_rate": 2e-05, "loss": 0.7179, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4171, "tokens_per_second_per_gpu": 18788.19, "total_tokens": 411867998 }, { "epoch": 0.26081520380095025, "grad_norm": 0.9284706115722656, "learning_rate": 2e-05, "loss": 0.7539, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4172, "tokens_per_second_per_gpu": 17664.46, "total_tokens": 411968062 }, { "epoch": 0.2608777194298575, "grad_norm": 0.9053377509117126, "learning_rate": 2e-05, "loss": 0.6954, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4173, "tokens_per_second_per_gpu": 17521.41, "total_tokens": 412067995 }, { "epoch": 0.2609402350587647, "grad_norm": 0.8981495499610901, "learning_rate": 2e-05, "loss": 0.6837, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4174, "tokens_per_second_per_gpu": 17492.54, "total_tokens": 412168763 }, { "epoch": 0.2610027506876719, "grad_norm": 0.9069626331329346, "learning_rate": 2e-05, "loss": 0.7075, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4175, "tokens_per_second_per_gpu": 17611.25, "total_tokens": 412268554 }, { "epoch": 0.26106526631657917, "grad_norm": 0.8753193020820618, "learning_rate": 2e-05, "loss": 0.6949, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4176, "tokens_per_second_per_gpu": 17396.54, "total_tokens": 412370405 }, { "epoch": 0.26112778194548636, "grad_norm": 0.9250307083129883, "learning_rate": 2e-05, "loss": 0.7125, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4177, "tokens_per_second_per_gpu": 17301.56, "total_tokens": 412471828 }, { "epoch": 0.2611902975743936, "grad_norm": 0.943524181842804, "learning_rate": 2e-05, "loss": 0.6739, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4178, "tokens_per_second_per_gpu": 17735.33, "total_tokens": 412573347 }, { "epoch": 0.26125281320330085, "grad_norm": 0.9070818424224854, "learning_rate": 2e-05, "loss": 0.6839, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4179, "tokens_per_second_per_gpu": 17663.33, "total_tokens": 412675751 }, { "epoch": 0.26131532883220804, "grad_norm": 0.8949222564697266, "learning_rate": 2e-05, "loss": 0.6832, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4180, "tokens_per_second_per_gpu": 16689.87, "total_tokens": 412772402 }, { "epoch": 0.2613778444611153, "grad_norm": 0.9185954928398132, "learning_rate": 2e-05, "loss": 0.6421, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4181, "tokens_per_second_per_gpu": 16948.3, "total_tokens": 412869787 }, { "epoch": 0.2614403600900225, "grad_norm": 0.8785514831542969, "learning_rate": 2e-05, "loss": 0.6615, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4182, "tokens_per_second_per_gpu": 17356.8, "total_tokens": 412969430 }, { "epoch": 0.2615028757189297, "grad_norm": 0.9430795311927795, "learning_rate": 2e-05, "loss": 0.6617, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4183, "tokens_per_second_per_gpu": 16534.11, "total_tokens": 413064223 }, { "epoch": 0.26156539134783696, "grad_norm": 0.9036682844161987, "learning_rate": 2e-05, "loss": 0.7488, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4184, "tokens_per_second_per_gpu": 18378.96, "total_tokens": 413169336 }, { "epoch": 0.2616279069767442, "grad_norm": 0.8721914887428284, "learning_rate": 2e-05, "loss": 0.6664, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4185, "tokens_per_second_per_gpu": 18612.94, "total_tokens": 413271877 }, { "epoch": 0.2616904226056514, "grad_norm": 0.9082210659980774, "learning_rate": 2e-05, "loss": 0.6773, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4186, "tokens_per_second_per_gpu": 17053.83, "total_tokens": 413370019 }, { "epoch": 0.26175293823455864, "grad_norm": 0.9078770279884338, "learning_rate": 2e-05, "loss": 0.6555, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4187, "tokens_per_second_per_gpu": 17121.14, "total_tokens": 413470276 }, { "epoch": 0.2618154538634659, "grad_norm": 0.9199742674827576, "learning_rate": 2e-05, "loss": 0.6767, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4188, "tokens_per_second_per_gpu": 17489.78, "total_tokens": 413566251 }, { "epoch": 0.2618779694923731, "grad_norm": 0.9109131693840027, "learning_rate": 2e-05, "loss": 0.6775, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4189, "tokens_per_second_per_gpu": 17329.83, "total_tokens": 413670841 }, { "epoch": 0.2619404851212803, "grad_norm": 0.8953001499176025, "learning_rate": 2e-05, "loss": 0.6937, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4190, "tokens_per_second_per_gpu": 18443.74, "total_tokens": 413774262 }, { "epoch": 0.26200300075018756, "grad_norm": 0.9347081780433655, "learning_rate": 2e-05, "loss": 0.67, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4191, "tokens_per_second_per_gpu": 17074.88, "total_tokens": 413869896 }, { "epoch": 0.26206551637909475, "grad_norm": 0.9378958940505981, "learning_rate": 2e-05, "loss": 0.682, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4192, "tokens_per_second_per_gpu": 17860.49, "total_tokens": 413969334 }, { "epoch": 0.262128032008002, "grad_norm": 0.9593502283096313, "learning_rate": 2e-05, "loss": 0.6967, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4193, "tokens_per_second_per_gpu": 16940.29, "total_tokens": 414068824 }, { "epoch": 0.26219054763690924, "grad_norm": 0.9000357985496521, "learning_rate": 2e-05, "loss": 0.6997, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4194, "tokens_per_second_per_gpu": 17022.63, "total_tokens": 414167458 }, { "epoch": 0.26225306326581643, "grad_norm": 1.0082662105560303, "learning_rate": 2e-05, "loss": 0.637, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4195, "tokens_per_second_per_gpu": 17014.3, "total_tokens": 414261555 }, { "epoch": 0.2623155788947237, "grad_norm": 0.9167553186416626, "learning_rate": 2e-05, "loss": 0.6576, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4196, "tokens_per_second_per_gpu": 17174.87, "total_tokens": 414361069 }, { "epoch": 0.2623780945236309, "grad_norm": 0.9047691226005554, "learning_rate": 2e-05, "loss": 0.6783, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4197, "tokens_per_second_per_gpu": 18201.5, "total_tokens": 414462611 }, { "epoch": 0.2624406101525381, "grad_norm": 0.896503746509552, "learning_rate": 2e-05, "loss": 0.6976, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4198, "tokens_per_second_per_gpu": 17077.18, "total_tokens": 414562920 }, { "epoch": 0.26250312578144536, "grad_norm": 0.8849107623100281, "learning_rate": 2e-05, "loss": 0.6486, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4199, "tokens_per_second_per_gpu": 17353.21, "total_tokens": 414661904 }, { "epoch": 0.2625656414103526, "grad_norm": 0.9103068113327026, "learning_rate": 2e-05, "loss": 0.6955, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4200, "tokens_per_second_per_gpu": 17952.6, "total_tokens": 414760575 }, { "epoch": 0.2626281570392598, "grad_norm": 0.9129807949066162, "learning_rate": 2e-05, "loss": 0.6684, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4201, "tokens_per_second_per_gpu": 18197.98, "total_tokens": 414864347 }, { "epoch": 0.26269067266816704, "grad_norm": 0.8724545240402222, "learning_rate": 2e-05, "loss": 0.6797, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4202, "tokens_per_second_per_gpu": 16502.33, "total_tokens": 414962102 }, { "epoch": 0.2627531882970743, "grad_norm": 0.8905046582221985, "learning_rate": 2e-05, "loss": 0.6909, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4203, "tokens_per_second_per_gpu": 17481.95, "total_tokens": 415061019 }, { "epoch": 0.26281570392598147, "grad_norm": 0.8968703150749207, "learning_rate": 2e-05, "loss": 0.6631, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4204, "tokens_per_second_per_gpu": 17895.73, "total_tokens": 415160115 }, { "epoch": 0.2628782195548887, "grad_norm": 0.8772841095924377, "learning_rate": 2e-05, "loss": 0.6802, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4205, "tokens_per_second_per_gpu": 18076.68, "total_tokens": 415261577 }, { "epoch": 0.26294073518379596, "grad_norm": 0.886259138584137, "learning_rate": 2e-05, "loss": 0.6783, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4206, "tokens_per_second_per_gpu": 17120.46, "total_tokens": 415359593 }, { "epoch": 0.26300325081270315, "grad_norm": 0.9002215266227722, "learning_rate": 2e-05, "loss": 0.7311, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4207, "tokens_per_second_per_gpu": 17921.71, "total_tokens": 415461559 }, { "epoch": 0.2630657664416104, "grad_norm": 0.9205129742622375, "learning_rate": 2e-05, "loss": 0.6996, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4208, "tokens_per_second_per_gpu": 18407.97, "total_tokens": 415562248 }, { "epoch": 0.26312828207051764, "grad_norm": 0.8791927099227905, "learning_rate": 2e-05, "loss": 0.6581, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4209, "tokens_per_second_per_gpu": 16672.15, "total_tokens": 415659264 }, { "epoch": 0.2631907976994249, "grad_norm": 0.9191926121711731, "learning_rate": 2e-05, "loss": 0.6977, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4210, "tokens_per_second_per_gpu": 17893.3, "total_tokens": 415760378 }, { "epoch": 0.2632533133283321, "grad_norm": 0.8672735691070557, "learning_rate": 2e-05, "loss": 0.7445, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4211, "tokens_per_second_per_gpu": 19046.28, "total_tokens": 415865367 }, { "epoch": 0.2633158289572393, "grad_norm": 0.875020444393158, "learning_rate": 2e-05, "loss": 0.6658, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4212, "tokens_per_second_per_gpu": 17881.78, "total_tokens": 415964567 }, { "epoch": 0.26337834458614656, "grad_norm": 0.9293093085289001, "learning_rate": 2e-05, "loss": 0.6538, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4213, "tokens_per_second_per_gpu": 16390.47, "total_tokens": 416060659 }, { "epoch": 0.26344086021505375, "grad_norm": 0.8891273140907288, "learning_rate": 2e-05, "loss": 0.6764, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4214, "tokens_per_second_per_gpu": 17863.06, "total_tokens": 416160886 }, { "epoch": 0.263503375843961, "grad_norm": 0.8824462294578552, "learning_rate": 2e-05, "loss": 0.6613, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4215, "tokens_per_second_per_gpu": 17629.97, "total_tokens": 416261854 }, { "epoch": 0.26356589147286824, "grad_norm": 0.8703598380088806, "learning_rate": 2e-05, "loss": 0.6915, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4216, "tokens_per_second_per_gpu": 17204.59, "total_tokens": 416360710 }, { "epoch": 0.26362840710177543, "grad_norm": 0.9137673377990723, "learning_rate": 2e-05, "loss": 0.6635, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4217, "tokens_per_second_per_gpu": 16193.61, "total_tokens": 416454393 }, { "epoch": 0.2636909227306827, "grad_norm": 0.9191132187843323, "learning_rate": 2e-05, "loss": 0.6815, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4218, "tokens_per_second_per_gpu": 16877.24, "total_tokens": 416551903 }, { "epoch": 0.2637534383595899, "grad_norm": 0.8794323205947876, "learning_rate": 2e-05, "loss": 0.6781, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4219, "tokens_per_second_per_gpu": 17071.5, "total_tokens": 416651361 }, { "epoch": 0.2638159539884971, "grad_norm": 0.8836289644241333, "learning_rate": 2e-05, "loss": 0.7035, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4220, "tokens_per_second_per_gpu": 18357.14, "total_tokens": 416756369 }, { "epoch": 0.26387846961740435, "grad_norm": 0.9146792888641357, "learning_rate": 2e-05, "loss": 0.6749, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4221, "tokens_per_second_per_gpu": 15692.2, "total_tokens": 416847648 }, { "epoch": 0.2639409852463116, "grad_norm": 0.8608711361885071, "learning_rate": 2e-05, "loss": 0.6902, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4222, "tokens_per_second_per_gpu": 17068.96, "total_tokens": 416948610 }, { "epoch": 0.2640035008752188, "grad_norm": 0.9195501208305359, "learning_rate": 2e-05, "loss": 0.6729, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4223, "tokens_per_second_per_gpu": 16853.31, "total_tokens": 417045914 }, { "epoch": 0.26406601650412603, "grad_norm": 0.8777695894241333, "learning_rate": 2e-05, "loss": 0.6734, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4224, "tokens_per_second_per_gpu": 17338.83, "total_tokens": 417145924 }, { "epoch": 0.2641285321330333, "grad_norm": 0.9209270477294922, "learning_rate": 2e-05, "loss": 0.6587, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4225, "tokens_per_second_per_gpu": 16142.39, "total_tokens": 417241643 }, { "epoch": 0.26419104776194047, "grad_norm": 0.9472132921218872, "learning_rate": 2e-05, "loss": 0.7019, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4226, "tokens_per_second_per_gpu": 17271.07, "total_tokens": 417337315 }, { "epoch": 0.2642535633908477, "grad_norm": 0.8949466943740845, "learning_rate": 2e-05, "loss": 0.6932, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4227, "tokens_per_second_per_gpu": 18103.34, "total_tokens": 417438859 }, { "epoch": 0.26431607901975496, "grad_norm": 0.9266636371612549, "learning_rate": 2e-05, "loss": 0.6945, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4228, "tokens_per_second_per_gpu": 16512.03, "total_tokens": 417534238 }, { "epoch": 0.26437859464866215, "grad_norm": 0.9107522368431091, "learning_rate": 2e-05, "loss": 0.6764, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4229, "tokens_per_second_per_gpu": 17437.67, "total_tokens": 417635967 }, { "epoch": 0.2644411102775694, "grad_norm": 0.9709533452987671, "learning_rate": 2e-05, "loss": 0.7161, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4230, "tokens_per_second_per_gpu": 17294.37, "total_tokens": 417736161 }, { "epoch": 0.26450362590647664, "grad_norm": 0.8809906244277954, "learning_rate": 2e-05, "loss": 0.6817, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4231, "tokens_per_second_per_gpu": 18029.51, "total_tokens": 417835432 }, { "epoch": 0.2645661415353838, "grad_norm": 0.9300373196601868, "learning_rate": 2e-05, "loss": 0.6878, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4232, "tokens_per_second_per_gpu": 17262.39, "total_tokens": 417934218 }, { "epoch": 0.26462865716429107, "grad_norm": 0.9854077696800232, "learning_rate": 2e-05, "loss": 0.6701, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4233, "tokens_per_second_per_gpu": 17386.85, "total_tokens": 418030630 }, { "epoch": 0.2646911727931983, "grad_norm": 0.8574416041374207, "learning_rate": 2e-05, "loss": 0.6618, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4234, "tokens_per_second_per_gpu": 18609.5, "total_tokens": 418134763 }, { "epoch": 0.2647536884221055, "grad_norm": 0.9151376485824585, "learning_rate": 2e-05, "loss": 0.6737, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4235, "tokens_per_second_per_gpu": 17460.92, "total_tokens": 418235055 }, { "epoch": 0.26481620405101275, "grad_norm": 0.8741106986999512, "learning_rate": 2e-05, "loss": 0.6562, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4236, "tokens_per_second_per_gpu": 16812.85, "total_tokens": 418331521 }, { "epoch": 0.26487871967992, "grad_norm": 0.9147612452507019, "learning_rate": 2e-05, "loss": 0.7095, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4237, "tokens_per_second_per_gpu": 17765.78, "total_tokens": 418434217 }, { "epoch": 0.2649412353088272, "grad_norm": 0.928931713104248, "learning_rate": 2e-05, "loss": 0.7186, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4238, "tokens_per_second_per_gpu": 17734.54, "total_tokens": 418534732 }, { "epoch": 0.26500375093773443, "grad_norm": 0.8642711043357849, "learning_rate": 2e-05, "loss": 0.6846, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4239, "tokens_per_second_per_gpu": 16753.37, "total_tokens": 418630919 }, { "epoch": 0.2650662665666417, "grad_norm": 0.9420955777168274, "learning_rate": 2e-05, "loss": 0.6774, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4240, "tokens_per_second_per_gpu": 17361.04, "total_tokens": 418727567 }, { "epoch": 0.26512878219554886, "grad_norm": 0.9278744459152222, "learning_rate": 2e-05, "loss": 0.7102, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4241, "tokens_per_second_per_gpu": 17637.82, "total_tokens": 418829659 }, { "epoch": 0.2651912978244561, "grad_norm": 0.9083293676376343, "learning_rate": 2e-05, "loss": 0.683, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4242, "tokens_per_second_per_gpu": 16621.02, "total_tokens": 418927725 }, { "epoch": 0.26525381345336335, "grad_norm": 0.9343655705451965, "learning_rate": 2e-05, "loss": 0.6797, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4243, "tokens_per_second_per_gpu": 18225.94, "total_tokens": 419027440 }, { "epoch": 0.26531632908227054, "grad_norm": 0.9153199791908264, "learning_rate": 2e-05, "loss": 0.6524, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4244, "tokens_per_second_per_gpu": 16909.87, "total_tokens": 419122270 }, { "epoch": 0.2653788447111778, "grad_norm": 0.9627645611763, "learning_rate": 2e-05, "loss": 0.7401, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4245, "tokens_per_second_per_gpu": 17400.1, "total_tokens": 419223671 }, { "epoch": 0.26544136034008503, "grad_norm": 0.9391821622848511, "learning_rate": 2e-05, "loss": 0.6692, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4246, "tokens_per_second_per_gpu": 17156.41, "total_tokens": 419321296 }, { "epoch": 0.2655038759689923, "grad_norm": 0.9306051135063171, "learning_rate": 2e-05, "loss": 0.6754, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4247, "tokens_per_second_per_gpu": 17454.48, "total_tokens": 419422224 }, { "epoch": 0.26556639159789946, "grad_norm": 0.9110625386238098, "learning_rate": 2e-05, "loss": 0.7119, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4248, "tokens_per_second_per_gpu": 17394.34, "total_tokens": 419520886 }, { "epoch": 0.2656289072268067, "grad_norm": 0.9102078080177307, "learning_rate": 2e-05, "loss": 0.724, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4249, "tokens_per_second_per_gpu": 17669.27, "total_tokens": 419622150 }, { "epoch": 0.26569142285571395, "grad_norm": 0.9509322643280029, "learning_rate": 2e-05, "loss": 0.7204, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4250, "tokens_per_second_per_gpu": 16932.64, "total_tokens": 419720518 }, { "epoch": 0.26575393848462114, "grad_norm": 0.9025906920433044, "learning_rate": 2e-05, "loss": 0.6475, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4251, "tokens_per_second_per_gpu": 16491.64, "total_tokens": 419813864 }, { "epoch": 0.2658164541135284, "grad_norm": 0.932349681854248, "learning_rate": 2e-05, "loss": 0.7111, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4252, "tokens_per_second_per_gpu": 16466.55, "total_tokens": 419909596 }, { "epoch": 0.26587896974243563, "grad_norm": 0.8913097977638245, "learning_rate": 2e-05, "loss": 0.6925, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4253, "tokens_per_second_per_gpu": 17287.0, "total_tokens": 420010294 }, { "epoch": 0.2659414853713428, "grad_norm": 0.9648452401161194, "learning_rate": 2e-05, "loss": 0.675, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4254, "tokens_per_second_per_gpu": 17079.61, "total_tokens": 420107407 }, { "epoch": 0.26600400100025007, "grad_norm": 0.8976430892944336, "learning_rate": 2e-05, "loss": 0.7092, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4255, "tokens_per_second_per_gpu": 17509.95, "total_tokens": 420209960 }, { "epoch": 0.2660665166291573, "grad_norm": 0.9098766446113586, "learning_rate": 2e-05, "loss": 0.7052, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4256, "tokens_per_second_per_gpu": 17473.59, "total_tokens": 420307373 }, { "epoch": 0.2661290322580645, "grad_norm": 0.9105928540229797, "learning_rate": 2e-05, "loss": 0.6734, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4257, "tokens_per_second_per_gpu": 16978.02, "total_tokens": 420406813 }, { "epoch": 0.26619154788697175, "grad_norm": 0.9089936017990112, "learning_rate": 2e-05, "loss": 0.679, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4258, "tokens_per_second_per_gpu": 17043.66, "total_tokens": 420504900 }, { "epoch": 0.266254063515879, "grad_norm": 0.9404191374778748, "learning_rate": 2e-05, "loss": 0.7233, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4259, "tokens_per_second_per_gpu": 17508.3, "total_tokens": 420603888 }, { "epoch": 0.2663165791447862, "grad_norm": 0.8904896378517151, "learning_rate": 2e-05, "loss": 0.6658, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4260, "tokens_per_second_per_gpu": 17745.6, "total_tokens": 420703183 }, { "epoch": 0.2663790947736934, "grad_norm": 0.9092914462089539, "learning_rate": 2e-05, "loss": 0.6924, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4261, "tokens_per_second_per_gpu": 17944.56, "total_tokens": 420804799 }, { "epoch": 0.26644161040260067, "grad_norm": 0.8871991634368896, "learning_rate": 2e-05, "loss": 0.6616, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4262, "tokens_per_second_per_gpu": 17225.22, "total_tokens": 420902190 }, { "epoch": 0.26650412603150786, "grad_norm": 0.9076506495475769, "learning_rate": 2e-05, "loss": 0.6691, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4263, "tokens_per_second_per_gpu": 16279.64, "total_tokens": 420998006 }, { "epoch": 0.2665666416604151, "grad_norm": 0.8782954216003418, "learning_rate": 2e-05, "loss": 0.696, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4264, "tokens_per_second_per_gpu": 17794.57, "total_tokens": 421100714 }, { "epoch": 0.26662915728932235, "grad_norm": 0.9276363253593445, "learning_rate": 2e-05, "loss": 0.7152, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4265, "tokens_per_second_per_gpu": 16397.6, "total_tokens": 421195549 }, { "epoch": 0.26669167291822954, "grad_norm": 0.8785309791564941, "learning_rate": 2e-05, "loss": 0.6611, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4266, "tokens_per_second_per_gpu": 16424.16, "total_tokens": 421295719 }, { "epoch": 0.2667541885471368, "grad_norm": 0.8972458243370056, "learning_rate": 2e-05, "loss": 0.6989, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4267, "tokens_per_second_per_gpu": 18234.11, "total_tokens": 421397789 }, { "epoch": 0.26681670417604403, "grad_norm": 0.88441002368927, "learning_rate": 2e-05, "loss": 0.7076, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4268, "tokens_per_second_per_gpu": 17798.3, "total_tokens": 421500182 }, { "epoch": 0.2668792198049512, "grad_norm": 0.8839501142501831, "learning_rate": 2e-05, "loss": 0.6801, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4269, "tokens_per_second_per_gpu": 18448.34, "total_tokens": 421601394 }, { "epoch": 0.26694173543385846, "grad_norm": 3.9492056369781494, "learning_rate": 2e-05, "loss": 0.6774, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4270, "tokens_per_second_per_gpu": 18155.1, "total_tokens": 421702878 }, { "epoch": 0.2670042510627657, "grad_norm": 0.9147720336914062, "learning_rate": 2e-05, "loss": 0.6863, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4271, "tokens_per_second_per_gpu": 17374.98, "total_tokens": 421803114 }, { "epoch": 0.2670667666916729, "grad_norm": 0.9092468023300171, "learning_rate": 2e-05, "loss": 0.6727, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4272, "tokens_per_second_per_gpu": 16730.55, "total_tokens": 421900567 }, { "epoch": 0.26712928232058014, "grad_norm": 0.8909763097763062, "learning_rate": 2e-05, "loss": 0.6724, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4273, "tokens_per_second_per_gpu": 16423.1, "total_tokens": 421998258 }, { "epoch": 0.2671917979494874, "grad_norm": 0.9058442115783691, "learning_rate": 2e-05, "loss": 0.6821, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4274, "tokens_per_second_per_gpu": 16760.05, "total_tokens": 422096245 }, { "epoch": 0.2672543135783946, "grad_norm": 0.9115772843360901, "learning_rate": 2e-05, "loss": 0.6819, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4275, "tokens_per_second_per_gpu": 16190.25, "total_tokens": 422194816 }, { "epoch": 0.2673168292073018, "grad_norm": 0.8694345355033875, "learning_rate": 2e-05, "loss": 0.6785, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4276, "tokens_per_second_per_gpu": 17413.84, "total_tokens": 422296977 }, { "epoch": 0.26737934483620907, "grad_norm": 0.8929054141044617, "learning_rate": 2e-05, "loss": 0.6947, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4277, "tokens_per_second_per_gpu": 17024.86, "total_tokens": 422397915 }, { "epoch": 0.26744186046511625, "grad_norm": 0.9079039096832275, "learning_rate": 2e-05, "loss": 0.6912, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4278, "tokens_per_second_per_gpu": 16301.4, "total_tokens": 422494930 }, { "epoch": 0.2675043760940235, "grad_norm": 0.8567566871643066, "learning_rate": 2e-05, "loss": 0.6644, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4279, "tokens_per_second_per_gpu": 16341.62, "total_tokens": 422591701 }, { "epoch": 0.26756689172293074, "grad_norm": 0.8775447607040405, "learning_rate": 2e-05, "loss": 0.6665, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4280, "tokens_per_second_per_gpu": 18163.96, "total_tokens": 422696128 }, { "epoch": 0.26762940735183793, "grad_norm": 0.9180373549461365, "learning_rate": 2e-05, "loss": 0.6983, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4281, "tokens_per_second_per_gpu": 18285.62, "total_tokens": 422796631 }, { "epoch": 0.2676919229807452, "grad_norm": 0.9331507086753845, "learning_rate": 2e-05, "loss": 0.6932, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4282, "tokens_per_second_per_gpu": 16991.97, "total_tokens": 422896382 }, { "epoch": 0.2677544386096524, "grad_norm": 0.9323473572731018, "learning_rate": 2e-05, "loss": 0.7189, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4283, "tokens_per_second_per_gpu": 18196.23, "total_tokens": 422997002 }, { "epoch": 0.2678169542385596, "grad_norm": 0.8638489842414856, "learning_rate": 2e-05, "loss": 0.6852, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4284, "tokens_per_second_per_gpu": 17695.79, "total_tokens": 423096443 }, { "epoch": 0.26787946986746686, "grad_norm": 0.9615659117698669, "learning_rate": 2e-05, "loss": 0.6683, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4285, "tokens_per_second_per_gpu": 17250.09, "total_tokens": 423195186 }, { "epoch": 0.2679419854963741, "grad_norm": 0.9254581332206726, "learning_rate": 2e-05, "loss": 0.6627, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4286, "tokens_per_second_per_gpu": 18014.48, "total_tokens": 423290971 }, { "epoch": 0.26800450112528135, "grad_norm": 0.940351128578186, "learning_rate": 2e-05, "loss": 0.6982, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4287, "tokens_per_second_per_gpu": 17074.7, "total_tokens": 423388442 }, { "epoch": 0.26806701675418854, "grad_norm": 0.8831025958061218, "learning_rate": 2e-05, "loss": 0.6607, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4288, "tokens_per_second_per_gpu": 17043.48, "total_tokens": 423484657 }, { "epoch": 0.2681295323830958, "grad_norm": 0.8980444669723511, "learning_rate": 2e-05, "loss": 0.6908, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4289, "tokens_per_second_per_gpu": 17239.17, "total_tokens": 423584005 }, { "epoch": 0.268192048012003, "grad_norm": 0.9220784306526184, "learning_rate": 2e-05, "loss": 0.6929, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4290, "tokens_per_second_per_gpu": 16328.46, "total_tokens": 423675256 }, { "epoch": 0.2682545636409102, "grad_norm": 0.8697383999824524, "learning_rate": 2e-05, "loss": 0.6612, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4291, "tokens_per_second_per_gpu": 17157.14, "total_tokens": 423773707 }, { "epoch": 0.26831707926981746, "grad_norm": 0.9368352890014648, "learning_rate": 2e-05, "loss": 0.6685, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4292, "tokens_per_second_per_gpu": 15701.82, "total_tokens": 423867803 }, { "epoch": 0.2683795948987247, "grad_norm": 0.9000068306922913, "learning_rate": 2e-05, "loss": 0.6785, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4293, "tokens_per_second_per_gpu": 17451.31, "total_tokens": 423969544 }, { "epoch": 0.2684421105276319, "grad_norm": 0.896850049495697, "learning_rate": 2e-05, "loss": 0.6606, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4294, "tokens_per_second_per_gpu": 16219.82, "total_tokens": 424061308 }, { "epoch": 0.26850462615653914, "grad_norm": 0.9009484052658081, "learning_rate": 2e-05, "loss": 0.6909, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4295, "tokens_per_second_per_gpu": 17450.9, "total_tokens": 424161428 }, { "epoch": 0.2685671417854464, "grad_norm": 0.9014275074005127, "learning_rate": 2e-05, "loss": 0.6954, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4296, "tokens_per_second_per_gpu": 17062.82, "total_tokens": 424259489 }, { "epoch": 0.2686296574143536, "grad_norm": 0.9178593158721924, "learning_rate": 2e-05, "loss": 0.6946, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4297, "tokens_per_second_per_gpu": 16865.38, "total_tokens": 424362189 }, { "epoch": 0.2686921730432608, "grad_norm": 0.937167227268219, "learning_rate": 2e-05, "loss": 0.6448, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4298, "tokens_per_second_per_gpu": 14993.49, "total_tokens": 424453145 }, { "epoch": 0.26875468867216806, "grad_norm": 0.8825377821922302, "learning_rate": 2e-05, "loss": 0.6696, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4299, "tokens_per_second_per_gpu": 18417.33, "total_tokens": 424552281 }, { "epoch": 0.26881720430107525, "grad_norm": 0.8899269700050354, "learning_rate": 2e-05, "loss": 0.6622, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4300, "tokens_per_second_per_gpu": 16725.42, "total_tokens": 424648824 }, { "epoch": 0.2688797199299825, "grad_norm": 0.8684025406837463, "learning_rate": 2e-05, "loss": 0.6834, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4301, "tokens_per_second_per_gpu": 18117.3, "total_tokens": 424748293 }, { "epoch": 0.26894223555888974, "grad_norm": 0.8597205281257629, "learning_rate": 2e-05, "loss": 0.6712, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4302, "tokens_per_second_per_gpu": 18153.47, "total_tokens": 424850342 }, { "epoch": 0.26900475118779693, "grad_norm": 0.8684161901473999, "learning_rate": 2e-05, "loss": 0.6553, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4303, "tokens_per_second_per_gpu": 17914.18, "total_tokens": 424952908 }, { "epoch": 0.2690672668167042, "grad_norm": 0.9636571407318115, "learning_rate": 2e-05, "loss": 0.6939, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4304, "tokens_per_second_per_gpu": 16724.47, "total_tokens": 425043490 }, { "epoch": 0.2691297824456114, "grad_norm": 0.9044768810272217, "learning_rate": 2e-05, "loss": 0.6644, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4305, "tokens_per_second_per_gpu": 16446.67, "total_tokens": 425137337 }, { "epoch": 0.2691922980745186, "grad_norm": 0.9272087216377258, "learning_rate": 2e-05, "loss": 0.7297, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4306, "tokens_per_second_per_gpu": 17906.69, "total_tokens": 425239343 }, { "epoch": 0.26925481370342585, "grad_norm": 0.871440052986145, "learning_rate": 2e-05, "loss": 0.6642, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4307, "tokens_per_second_per_gpu": 17518.72, "total_tokens": 425336906 }, { "epoch": 0.2693173293323331, "grad_norm": 0.9007625579833984, "learning_rate": 2e-05, "loss": 0.6872, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4308, "tokens_per_second_per_gpu": 17083.87, "total_tokens": 425431652 }, { "epoch": 0.2693798449612403, "grad_norm": 0.8974846601486206, "learning_rate": 2e-05, "loss": 0.6938, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4309, "tokens_per_second_per_gpu": 16485.84, "total_tokens": 425529655 }, { "epoch": 0.26944236059014753, "grad_norm": 0.9547582268714905, "learning_rate": 2e-05, "loss": 0.733, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4310, "tokens_per_second_per_gpu": 16339.4, "total_tokens": 425626052 }, { "epoch": 0.2695048762190548, "grad_norm": 0.89569091796875, "learning_rate": 2e-05, "loss": 0.6725, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4311, "tokens_per_second_per_gpu": 17008.86, "total_tokens": 425721066 }, { "epoch": 0.26956739184796197, "grad_norm": 0.9463223218917847, "learning_rate": 2e-05, "loss": 0.6749, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4312, "tokens_per_second_per_gpu": 16326.83, "total_tokens": 425818147 }, { "epoch": 0.2696299074768692, "grad_norm": 0.8906797170639038, "learning_rate": 2e-05, "loss": 0.706, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4313, "tokens_per_second_per_gpu": 17998.06, "total_tokens": 425921264 }, { "epoch": 0.26969242310577646, "grad_norm": 0.8913869261741638, "learning_rate": 2e-05, "loss": 0.6889, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4314, "tokens_per_second_per_gpu": 17934.55, "total_tokens": 426021691 }, { "epoch": 0.26975493873468365, "grad_norm": 0.8814448118209839, "learning_rate": 2e-05, "loss": 0.6708, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4315, "tokens_per_second_per_gpu": 18394.02, "total_tokens": 426123834 }, { "epoch": 0.2698174543635909, "grad_norm": 0.9048916101455688, "learning_rate": 2e-05, "loss": 0.6494, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4316, "tokens_per_second_per_gpu": 16597.88, "total_tokens": 426220339 }, { "epoch": 0.26987996999249814, "grad_norm": 0.8770090341567993, "learning_rate": 2e-05, "loss": 0.6694, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4317, "tokens_per_second_per_gpu": 17307.4, "total_tokens": 426320137 }, { "epoch": 0.2699424856214053, "grad_norm": 0.8799930810928345, "learning_rate": 2e-05, "loss": 0.7308, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4318, "tokens_per_second_per_gpu": 17880.23, "total_tokens": 426424096 }, { "epoch": 0.27000500125031257, "grad_norm": 0.8860856294631958, "learning_rate": 2e-05, "loss": 0.6606, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4319, "tokens_per_second_per_gpu": 17826.44, "total_tokens": 426527318 }, { "epoch": 0.2700675168792198, "grad_norm": 0.9134451746940613, "learning_rate": 2e-05, "loss": 0.6901, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4320, "tokens_per_second_per_gpu": 18132.04, "total_tokens": 426629215 }, { "epoch": 0.270130032508127, "grad_norm": 0.9543973207473755, "learning_rate": 2e-05, "loss": 0.7047, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4321, "tokens_per_second_per_gpu": 18623.23, "total_tokens": 426733938 }, { "epoch": 0.27019254813703425, "grad_norm": 0.9756497144699097, "learning_rate": 2e-05, "loss": 0.6632, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4322, "tokens_per_second_per_gpu": 16127.66, "total_tokens": 426830106 }, { "epoch": 0.2702550637659415, "grad_norm": 0.9202433824539185, "learning_rate": 2e-05, "loss": 0.6679, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4323, "tokens_per_second_per_gpu": 16647.75, "total_tokens": 426929277 }, { "epoch": 0.27031757939484874, "grad_norm": 0.8702878952026367, "learning_rate": 2e-05, "loss": 0.6639, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4324, "tokens_per_second_per_gpu": 18167.4, "total_tokens": 427031494 }, { "epoch": 0.27038009502375593, "grad_norm": 0.887107253074646, "learning_rate": 2e-05, "loss": 0.6841, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4325, "tokens_per_second_per_gpu": 18372.14, "total_tokens": 427130940 }, { "epoch": 0.2704426106526632, "grad_norm": 0.9427415728569031, "learning_rate": 2e-05, "loss": 0.6927, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4326, "tokens_per_second_per_gpu": 18104.99, "total_tokens": 427231808 }, { "epoch": 0.2705051262815704, "grad_norm": 0.9571824073791504, "learning_rate": 2e-05, "loss": 0.6665, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4327, "tokens_per_second_per_gpu": 16348.26, "total_tokens": 427328489 }, { "epoch": 0.2705676419104776, "grad_norm": 0.9439906477928162, "learning_rate": 2e-05, "loss": 0.7115, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4328, "tokens_per_second_per_gpu": 16078.64, "total_tokens": 427424078 }, { "epoch": 0.27063015753938485, "grad_norm": 0.953771710395813, "learning_rate": 2e-05, "loss": 0.713, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4329, "tokens_per_second_per_gpu": 18119.16, "total_tokens": 427523696 }, { "epoch": 0.2706926731682921, "grad_norm": 0.9298506379127502, "learning_rate": 2e-05, "loss": 0.7009, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4330, "tokens_per_second_per_gpu": 17028.4, "total_tokens": 427622688 }, { "epoch": 0.2707551887971993, "grad_norm": 0.9465619921684265, "learning_rate": 2e-05, "loss": 0.6608, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4331, "tokens_per_second_per_gpu": 16588.77, "total_tokens": 427716338 }, { "epoch": 0.27081770442610653, "grad_norm": 0.9476547837257385, "learning_rate": 2e-05, "loss": 0.6866, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4332, "tokens_per_second_per_gpu": 18242.46, "total_tokens": 427813559 }, { "epoch": 0.2708802200550138, "grad_norm": 0.9038105010986328, "learning_rate": 2e-05, "loss": 0.6761, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4333, "tokens_per_second_per_gpu": 16868.63, "total_tokens": 427911800 }, { "epoch": 0.27094273568392097, "grad_norm": 0.89694744348526, "learning_rate": 2e-05, "loss": 0.7168, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4334, "tokens_per_second_per_gpu": 17167.1, "total_tokens": 428012791 }, { "epoch": 0.2710052513128282, "grad_norm": 0.8498625159263611, "learning_rate": 2e-05, "loss": 0.6446, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4335, "tokens_per_second_per_gpu": 17291.49, "total_tokens": 428113664 }, { "epoch": 0.27106776694173546, "grad_norm": 1.0208953619003296, "learning_rate": 2e-05, "loss": 0.7023, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4336, "tokens_per_second_per_gpu": 16018.69, "total_tokens": 428208819 }, { "epoch": 0.27113028257064264, "grad_norm": 0.9370517730712891, "learning_rate": 2e-05, "loss": 0.6595, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4337, "tokens_per_second_per_gpu": 16054.58, "total_tokens": 428302389 }, { "epoch": 0.2711927981995499, "grad_norm": 0.9112246036529541, "learning_rate": 2e-05, "loss": 0.6597, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4338, "tokens_per_second_per_gpu": 16163.53, "total_tokens": 428399705 }, { "epoch": 0.27125531382845713, "grad_norm": 0.9306641817092896, "learning_rate": 2e-05, "loss": 0.6267, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4339, "tokens_per_second_per_gpu": 15924.87, "total_tokens": 428494075 }, { "epoch": 0.2713178294573643, "grad_norm": 0.9223154187202454, "learning_rate": 2e-05, "loss": 0.6758, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4340, "tokens_per_second_per_gpu": 17104.23, "total_tokens": 428590053 }, { "epoch": 0.27138034508627157, "grad_norm": 0.8961399793624878, "learning_rate": 2e-05, "loss": 0.7188, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4341, "tokens_per_second_per_gpu": 16469.03, "total_tokens": 428688804 }, { "epoch": 0.2714428607151788, "grad_norm": 0.9184200763702393, "learning_rate": 2e-05, "loss": 0.6786, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4342, "tokens_per_second_per_gpu": 17148.25, "total_tokens": 428787434 }, { "epoch": 0.271505376344086, "grad_norm": 0.9783278703689575, "learning_rate": 2e-05, "loss": 0.6657, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4343, "tokens_per_second_per_gpu": 16379.09, "total_tokens": 428882536 }, { "epoch": 0.27156789197299325, "grad_norm": 0.9234119057655334, "learning_rate": 2e-05, "loss": 0.6786, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4344, "tokens_per_second_per_gpu": 16739.6, "total_tokens": 428977523 }, { "epoch": 0.2716304076019005, "grad_norm": 0.8775956034660339, "learning_rate": 2e-05, "loss": 0.6773, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4345, "tokens_per_second_per_gpu": 18161.59, "total_tokens": 429077761 }, { "epoch": 0.2716929232308077, "grad_norm": 0.9160425662994385, "learning_rate": 2e-05, "loss": 0.6992, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4346, "tokens_per_second_per_gpu": 16764.47, "total_tokens": 429175442 }, { "epoch": 0.2717554388597149, "grad_norm": 0.861752986907959, "learning_rate": 2e-05, "loss": 0.6453, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4347, "tokens_per_second_per_gpu": 17198.1, "total_tokens": 429272551 }, { "epoch": 0.27181795448862217, "grad_norm": 0.8995417356491089, "learning_rate": 2e-05, "loss": 0.6509, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4348, "tokens_per_second_per_gpu": 16449.81, "total_tokens": 429372479 }, { "epoch": 0.27188047011752936, "grad_norm": 0.9095370769500732, "learning_rate": 2e-05, "loss": 0.6662, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4349, "tokens_per_second_per_gpu": 17137.87, "total_tokens": 429466231 }, { "epoch": 0.2719429857464366, "grad_norm": 0.8765286803245544, "learning_rate": 2e-05, "loss": 0.6773, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4350, "tokens_per_second_per_gpu": 16777.02, "total_tokens": 429563183 }, { "epoch": 0.27200550137534385, "grad_norm": 0.8417704701423645, "learning_rate": 2e-05, "loss": 0.6625, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4351, "tokens_per_second_per_gpu": 17875.36, "total_tokens": 429665160 }, { "epoch": 0.27206801700425104, "grad_norm": 0.8968270421028137, "learning_rate": 2e-05, "loss": 0.731, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4352, "tokens_per_second_per_gpu": 17355.52, "total_tokens": 429765988 }, { "epoch": 0.2721305326331583, "grad_norm": 0.9261389374732971, "learning_rate": 2e-05, "loss": 0.7218, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4353, "tokens_per_second_per_gpu": 17669.14, "total_tokens": 429868026 }, { "epoch": 0.27219304826206553, "grad_norm": 0.8975645899772644, "learning_rate": 2e-05, "loss": 0.6946, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4354, "tokens_per_second_per_gpu": 18362.27, "total_tokens": 429969593 }, { "epoch": 0.2722555638909727, "grad_norm": 0.9218024015426636, "learning_rate": 2e-05, "loss": 0.6814, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4355, "tokens_per_second_per_gpu": 17780.61, "total_tokens": 430068786 }, { "epoch": 0.27231807951987996, "grad_norm": 0.912869393825531, "learning_rate": 2e-05, "loss": 0.6851, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4356, "tokens_per_second_per_gpu": 17268.95, "total_tokens": 430168785 }, { "epoch": 0.2723805951487872, "grad_norm": 0.9061055183410645, "learning_rate": 2e-05, "loss": 0.7345, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4357, "tokens_per_second_per_gpu": 18083.98, "total_tokens": 430270161 }, { "epoch": 0.2724431107776944, "grad_norm": 0.9000739455223083, "learning_rate": 2e-05, "loss": 0.6684, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4358, "tokens_per_second_per_gpu": 16624.69, "total_tokens": 430366096 }, { "epoch": 0.27250562640660164, "grad_norm": 0.9004300832748413, "learning_rate": 2e-05, "loss": 0.6933, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4359, "tokens_per_second_per_gpu": 18364.58, "total_tokens": 430472241 }, { "epoch": 0.2725681420355089, "grad_norm": 0.8840014338493347, "learning_rate": 2e-05, "loss": 0.6716, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4360, "tokens_per_second_per_gpu": 17455.32, "total_tokens": 430570401 }, { "epoch": 0.27263065766441613, "grad_norm": 0.8949708938598633, "learning_rate": 2e-05, "loss": 0.6268, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4361, "tokens_per_second_per_gpu": 17526.75, "total_tokens": 430666301 }, { "epoch": 0.2726931732933233, "grad_norm": 0.889354407787323, "learning_rate": 2e-05, "loss": 0.6818, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4362, "tokens_per_second_per_gpu": 16995.85, "total_tokens": 430766482 }, { "epoch": 0.27275568892223057, "grad_norm": 0.8605020642280579, "learning_rate": 2e-05, "loss": 0.632, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4363, "tokens_per_second_per_gpu": 16658.64, "total_tokens": 430864667 }, { "epoch": 0.2728182045511378, "grad_norm": 0.8725132346153259, "learning_rate": 2e-05, "loss": 0.6841, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4364, "tokens_per_second_per_gpu": 18112.86, "total_tokens": 430967186 }, { "epoch": 0.272880720180045, "grad_norm": 0.8950891494750977, "learning_rate": 2e-05, "loss": 0.673, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4365, "tokens_per_second_per_gpu": 17824.38, "total_tokens": 431069439 }, { "epoch": 0.27294323580895224, "grad_norm": 0.919255793094635, "learning_rate": 2e-05, "loss": 0.6506, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4366, "tokens_per_second_per_gpu": 15879.66, "total_tokens": 431161162 }, { "epoch": 0.2730057514378595, "grad_norm": 0.9393294453620911, "learning_rate": 2e-05, "loss": 0.7074, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4367, "tokens_per_second_per_gpu": 15883.31, "total_tokens": 431255827 }, { "epoch": 0.2730682670667667, "grad_norm": 0.9087764024734497, "learning_rate": 2e-05, "loss": 0.6837, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4368, "tokens_per_second_per_gpu": 17085.83, "total_tokens": 431354414 }, { "epoch": 0.2731307826956739, "grad_norm": 0.8947122097015381, "learning_rate": 2e-05, "loss": 0.649, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4369, "tokens_per_second_per_gpu": 17193.29, "total_tokens": 431453019 }, { "epoch": 0.27319329832458117, "grad_norm": 0.8941083550453186, "learning_rate": 2e-05, "loss": 0.6938, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4370, "tokens_per_second_per_gpu": 17438.86, "total_tokens": 431554123 }, { "epoch": 0.27325581395348836, "grad_norm": 0.9429088830947876, "learning_rate": 2e-05, "loss": 0.6545, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4371, "tokens_per_second_per_gpu": 16353.3, "total_tokens": 431648252 }, { "epoch": 0.2733183295823956, "grad_norm": 1.0242456197738647, "learning_rate": 2e-05, "loss": 0.7142, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4372, "tokens_per_second_per_gpu": 17038.12, "total_tokens": 431748941 }, { "epoch": 0.27338084521130285, "grad_norm": 0.9197298884391785, "learning_rate": 2e-05, "loss": 0.6671, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4373, "tokens_per_second_per_gpu": 17751.54, "total_tokens": 431850581 }, { "epoch": 0.27344336084021004, "grad_norm": 0.9090333580970764, "learning_rate": 2e-05, "loss": 0.7181, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4374, "tokens_per_second_per_gpu": 18038.25, "total_tokens": 431955137 }, { "epoch": 0.2735058764691173, "grad_norm": 0.9366812705993652, "learning_rate": 2e-05, "loss": 0.6731, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4375, "tokens_per_second_per_gpu": 17350.6, "total_tokens": 432054549 }, { "epoch": 0.2735683920980245, "grad_norm": 0.9094204902648926, "learning_rate": 2e-05, "loss": 0.6943, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4376, "tokens_per_second_per_gpu": 17654.99, "total_tokens": 432156444 }, { "epoch": 0.2736309077269317, "grad_norm": 0.9014629125595093, "learning_rate": 2e-05, "loss": 0.7082, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4377, "tokens_per_second_per_gpu": 16338.0, "total_tokens": 432254500 }, { "epoch": 0.27369342335583896, "grad_norm": 0.8897238969802856, "learning_rate": 2e-05, "loss": 0.663, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4378, "tokens_per_second_per_gpu": 16300.91, "total_tokens": 432350874 }, { "epoch": 0.2737559389847462, "grad_norm": 0.9144517183303833, "learning_rate": 2e-05, "loss": 0.6986, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4379, "tokens_per_second_per_gpu": 17453.99, "total_tokens": 432447868 }, { "epoch": 0.2738184546136534, "grad_norm": 0.8899585008621216, "learning_rate": 2e-05, "loss": 0.647, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4380, "tokens_per_second_per_gpu": 17282.2, "total_tokens": 432544427 }, { "epoch": 0.27388097024256064, "grad_norm": 0.9078272581100464, "learning_rate": 2e-05, "loss": 0.7261, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4381, "tokens_per_second_per_gpu": 18179.89, "total_tokens": 432648224 }, { "epoch": 0.2739434858714679, "grad_norm": 0.9082324504852295, "learning_rate": 2e-05, "loss": 0.685, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4382, "tokens_per_second_per_gpu": 17117.38, "total_tokens": 432747466 }, { "epoch": 0.2740060015003751, "grad_norm": 0.882368266582489, "learning_rate": 2e-05, "loss": 0.6478, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4383, "tokens_per_second_per_gpu": 17868.14, "total_tokens": 432844122 }, { "epoch": 0.2740685171292823, "grad_norm": 0.9052348732948303, "learning_rate": 2e-05, "loss": 0.6762, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4384, "tokens_per_second_per_gpu": 16710.83, "total_tokens": 432942349 }, { "epoch": 0.27413103275818956, "grad_norm": 0.9795336127281189, "learning_rate": 2e-05, "loss": 0.6658, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4385, "tokens_per_second_per_gpu": 17822.3, "total_tokens": 433039329 }, { "epoch": 0.27419354838709675, "grad_norm": 0.9036949872970581, "learning_rate": 2e-05, "loss": 0.6596, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4386, "tokens_per_second_per_gpu": 16114.58, "total_tokens": 433136137 }, { "epoch": 0.274256064016004, "grad_norm": 0.8914546370506287, "learning_rate": 2e-05, "loss": 0.6805, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4387, "tokens_per_second_per_gpu": 16464.98, "total_tokens": 433231436 }, { "epoch": 0.27431857964491124, "grad_norm": 0.9596858620643616, "learning_rate": 2e-05, "loss": 0.7199, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4388, "tokens_per_second_per_gpu": 17601.61, "total_tokens": 433332963 }, { "epoch": 0.27438109527381843, "grad_norm": 0.9267081618309021, "learning_rate": 2e-05, "loss": 0.6509, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4389, "tokens_per_second_per_gpu": 16032.08, "total_tokens": 433427273 }, { "epoch": 0.2744436109027257, "grad_norm": 0.8969563841819763, "learning_rate": 2e-05, "loss": 0.7232, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4390, "tokens_per_second_per_gpu": 17389.55, "total_tokens": 433526607 }, { "epoch": 0.2745061265316329, "grad_norm": 0.9653720259666443, "learning_rate": 2e-05, "loss": 0.6914, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4391, "tokens_per_second_per_gpu": 17071.96, "total_tokens": 433626941 }, { "epoch": 0.2745686421605401, "grad_norm": 0.9059284925460815, "learning_rate": 2e-05, "loss": 0.6634, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4392, "tokens_per_second_per_gpu": 17545.66, "total_tokens": 433725130 }, { "epoch": 0.27463115778944736, "grad_norm": 0.9047160148620605, "learning_rate": 2e-05, "loss": 0.66, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4393, "tokens_per_second_per_gpu": 19190.01, "total_tokens": 433828449 }, { "epoch": 0.2746936734183546, "grad_norm": 0.945035457611084, "learning_rate": 2e-05, "loss": 0.6893, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4394, "tokens_per_second_per_gpu": 15882.67, "total_tokens": 433922639 }, { "epoch": 0.2747561890472618, "grad_norm": 0.9214465022087097, "learning_rate": 2e-05, "loss": 0.6737, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4395, "tokens_per_second_per_gpu": 18427.49, "total_tokens": 434024935 }, { "epoch": 0.27481870467616903, "grad_norm": 0.910364031791687, "learning_rate": 2e-05, "loss": 0.7091, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4396, "tokens_per_second_per_gpu": 18298.08, "total_tokens": 434123412 }, { "epoch": 0.2748812203050763, "grad_norm": 0.962637186050415, "learning_rate": 2e-05, "loss": 0.6634, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4397, "tokens_per_second_per_gpu": 17613.72, "total_tokens": 434221820 }, { "epoch": 0.27494373593398347, "grad_norm": 0.9340899586677551, "learning_rate": 2e-05, "loss": 0.6614, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4398, "tokens_per_second_per_gpu": 17600.69, "total_tokens": 434322020 }, { "epoch": 0.2750062515628907, "grad_norm": 0.9065322875976562, "learning_rate": 2e-05, "loss": 0.7141, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4399, "tokens_per_second_per_gpu": 17101.41, "total_tokens": 434421603 }, { "epoch": 0.27506876719179796, "grad_norm": 0.8939502835273743, "learning_rate": 2e-05, "loss": 0.7142, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4400, "tokens_per_second_per_gpu": 17275.96, "total_tokens": 434522427 }, { "epoch": 0.2751312828207052, "grad_norm": 0.88838791847229, "learning_rate": 2e-05, "loss": 0.6827, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4401, "tokens_per_second_per_gpu": 17509.14, "total_tokens": 434621400 }, { "epoch": 0.2751937984496124, "grad_norm": 0.9742452502250671, "learning_rate": 2e-05, "loss": 0.6679, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4402, "tokens_per_second_per_gpu": 16893.62, "total_tokens": 434721386 }, { "epoch": 0.27525631407851964, "grad_norm": 0.8923633694648743, "learning_rate": 2e-05, "loss": 0.7249, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4403, "tokens_per_second_per_gpu": 18828.49, "total_tokens": 434826083 }, { "epoch": 0.2753188297074269, "grad_norm": 0.8644447922706604, "learning_rate": 2e-05, "loss": 0.6525, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4404, "tokens_per_second_per_gpu": 17902.01, "total_tokens": 434924233 }, { "epoch": 0.27538134533633407, "grad_norm": 0.8477816581726074, "learning_rate": 2e-05, "loss": 0.6454, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4405, "tokens_per_second_per_gpu": 17196.31, "total_tokens": 435022400 }, { "epoch": 0.2754438609652413, "grad_norm": 0.9248831272125244, "learning_rate": 2e-05, "loss": 0.7019, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4406, "tokens_per_second_per_gpu": 16890.18, "total_tokens": 435116475 }, { "epoch": 0.27550637659414856, "grad_norm": 0.9289442300796509, "learning_rate": 2e-05, "loss": 0.7237, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4407, "tokens_per_second_per_gpu": 17569.1, "total_tokens": 435217446 }, { "epoch": 0.27556889222305575, "grad_norm": 0.9008487462997437, "learning_rate": 2e-05, "loss": 0.7193, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4408, "tokens_per_second_per_gpu": 18868.05, "total_tokens": 435323360 }, { "epoch": 0.275631407851963, "grad_norm": 0.8781098127365112, "learning_rate": 2e-05, "loss": 0.6963, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4409, "tokens_per_second_per_gpu": 17318.25, "total_tokens": 435427063 }, { "epoch": 0.27569392348087024, "grad_norm": 0.8939477205276489, "learning_rate": 2e-05, "loss": 0.715, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4410, "tokens_per_second_per_gpu": 17972.55, "total_tokens": 435527374 }, { "epoch": 0.27575643910977743, "grad_norm": 0.8780567049980164, "learning_rate": 2e-05, "loss": 0.6797, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4411, "tokens_per_second_per_gpu": 17291.5, "total_tokens": 435627526 }, { "epoch": 0.2758189547386847, "grad_norm": 0.9420671463012695, "learning_rate": 2e-05, "loss": 0.6966, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4412, "tokens_per_second_per_gpu": 17927.53, "total_tokens": 435731736 }, { "epoch": 0.2758814703675919, "grad_norm": 0.923517107963562, "learning_rate": 2e-05, "loss": 0.6542, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4413, "tokens_per_second_per_gpu": 16800.9, "total_tokens": 435830426 }, { "epoch": 0.2759439859964991, "grad_norm": 0.9596720933914185, "learning_rate": 2e-05, "loss": 0.7077, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4414, "tokens_per_second_per_gpu": 16197.31, "total_tokens": 435927864 }, { "epoch": 0.27600650162540635, "grad_norm": 0.8724410533905029, "learning_rate": 2e-05, "loss": 0.6229, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4415, "tokens_per_second_per_gpu": 17104.69, "total_tokens": 436022880 }, { "epoch": 0.2760690172543136, "grad_norm": 0.9530271291732788, "learning_rate": 2e-05, "loss": 0.7163, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4416, "tokens_per_second_per_gpu": 17575.7, "total_tokens": 436125205 }, { "epoch": 0.2761315328832208, "grad_norm": 0.9450022578239441, "learning_rate": 2e-05, "loss": 0.6715, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4417, "tokens_per_second_per_gpu": 17287.13, "total_tokens": 436225403 }, { "epoch": 0.27619404851212803, "grad_norm": 0.8793268799781799, "learning_rate": 2e-05, "loss": 0.6408, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4418, "tokens_per_second_per_gpu": 17226.19, "total_tokens": 436325362 }, { "epoch": 0.2762565641410353, "grad_norm": 0.9093902111053467, "learning_rate": 2e-05, "loss": 0.6666, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4419, "tokens_per_second_per_gpu": 18078.47, "total_tokens": 436422530 }, { "epoch": 0.27631907976994247, "grad_norm": 0.8736867308616638, "learning_rate": 2e-05, "loss": 0.6794, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4420, "tokens_per_second_per_gpu": 17187.28, "total_tokens": 436520988 }, { "epoch": 0.2763815953988497, "grad_norm": 0.9157578945159912, "learning_rate": 2e-05, "loss": 0.6793, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4421, "tokens_per_second_per_gpu": 17544.95, "total_tokens": 436619264 }, { "epoch": 0.27644411102775696, "grad_norm": 0.9323681592941284, "learning_rate": 2e-05, "loss": 0.7498, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4422, "tokens_per_second_per_gpu": 16978.59, "total_tokens": 436717588 }, { "epoch": 0.27650662665666415, "grad_norm": 0.9265204668045044, "learning_rate": 2e-05, "loss": 0.6701, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4423, "tokens_per_second_per_gpu": 16905.58, "total_tokens": 436811014 }, { "epoch": 0.2765691422855714, "grad_norm": 0.9297134876251221, "learning_rate": 2e-05, "loss": 0.6732, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4424, "tokens_per_second_per_gpu": 16818.99, "total_tokens": 436909041 }, { "epoch": 0.27663165791447863, "grad_norm": 0.9773001074790955, "learning_rate": 2e-05, "loss": 0.6646, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4425, "tokens_per_second_per_gpu": 17085.66, "total_tokens": 437006573 }, { "epoch": 0.2766941735433858, "grad_norm": 0.8992312550544739, "learning_rate": 2e-05, "loss": 0.6791, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4426, "tokens_per_second_per_gpu": 17436.09, "total_tokens": 437104389 }, { "epoch": 0.27675668917229307, "grad_norm": 0.98764568567276, "learning_rate": 2e-05, "loss": 0.6968, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4427, "tokens_per_second_per_gpu": 18038.04, "total_tokens": 437205735 }, { "epoch": 0.2768192048012003, "grad_norm": 0.920536994934082, "learning_rate": 2e-05, "loss": 0.6864, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4428, "tokens_per_second_per_gpu": 18065.57, "total_tokens": 437307116 }, { "epoch": 0.2768817204301075, "grad_norm": 0.9670096039772034, "learning_rate": 2e-05, "loss": 0.6725, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4429, "tokens_per_second_per_gpu": 17046.68, "total_tokens": 437405201 }, { "epoch": 0.27694423605901475, "grad_norm": 0.8892179131507874, "learning_rate": 2e-05, "loss": 0.6533, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4430, "tokens_per_second_per_gpu": 16892.21, "total_tokens": 437502958 }, { "epoch": 0.277006751687922, "grad_norm": 0.8732394576072693, "learning_rate": 2e-05, "loss": 0.6261, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4431, "tokens_per_second_per_gpu": 15699.09, "total_tokens": 437597721 }, { "epoch": 0.2770692673168292, "grad_norm": 0.9049885869026184, "learning_rate": 2e-05, "loss": 0.6657, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4432, "tokens_per_second_per_gpu": 15938.31, "total_tokens": 437691775 }, { "epoch": 0.2771317829457364, "grad_norm": 8.27344799041748, "learning_rate": 2e-05, "loss": 0.711, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4433, "tokens_per_second_per_gpu": 17301.77, "total_tokens": 437791330 }, { "epoch": 0.27719429857464367, "grad_norm": 0.9225283861160278, "learning_rate": 2e-05, "loss": 0.656, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4434, "tokens_per_second_per_gpu": 16616.96, "total_tokens": 437889988 }, { "epoch": 0.27725681420355086, "grad_norm": 0.8972092270851135, "learning_rate": 2e-05, "loss": 0.6912, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4435, "tokens_per_second_per_gpu": 18179.4, "total_tokens": 437993396 }, { "epoch": 0.2773193298324581, "grad_norm": 0.9175832867622375, "learning_rate": 2e-05, "loss": 0.6896, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4436, "tokens_per_second_per_gpu": 17607.6, "total_tokens": 438090845 }, { "epoch": 0.27738184546136535, "grad_norm": 0.9191313982009888, "learning_rate": 2e-05, "loss": 0.6947, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4437, "tokens_per_second_per_gpu": 16168.27, "total_tokens": 438186722 }, { "epoch": 0.2774443610902726, "grad_norm": 0.8817815780639648, "learning_rate": 2e-05, "loss": 0.6635, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4438, "tokens_per_second_per_gpu": 17167.54, "total_tokens": 438283482 }, { "epoch": 0.2775068767191798, "grad_norm": 0.9155944585800171, "learning_rate": 2e-05, "loss": 0.661, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4439, "tokens_per_second_per_gpu": 17015.36, "total_tokens": 438380888 }, { "epoch": 0.27756939234808703, "grad_norm": 0.9607711434364319, "learning_rate": 2e-05, "loss": 0.7269, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4440, "tokens_per_second_per_gpu": 17679.7, "total_tokens": 438482027 }, { "epoch": 0.2776319079769943, "grad_norm": 0.8697248697280884, "learning_rate": 2e-05, "loss": 0.703, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4441, "tokens_per_second_per_gpu": 18691.11, "total_tokens": 438584256 }, { "epoch": 0.27769442360590146, "grad_norm": 0.9431037902832031, "learning_rate": 2e-05, "loss": 0.6466, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4442, "tokens_per_second_per_gpu": 15451.34, "total_tokens": 438679928 }, { "epoch": 0.2777569392348087, "grad_norm": 0.9068710803985596, "learning_rate": 2e-05, "loss": 0.6737, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4443, "tokens_per_second_per_gpu": 18442.53, "total_tokens": 438784999 }, { "epoch": 0.27781945486371595, "grad_norm": 0.8644813299179077, "learning_rate": 2e-05, "loss": 0.6807, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4444, "tokens_per_second_per_gpu": 18005.51, "total_tokens": 438886942 }, { "epoch": 0.27788197049262314, "grad_norm": 0.8753665685653687, "learning_rate": 2e-05, "loss": 0.642, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4445, "tokens_per_second_per_gpu": 16534.05, "total_tokens": 438984163 }, { "epoch": 0.2779444861215304, "grad_norm": 0.9012671113014221, "learning_rate": 2e-05, "loss": 0.7076, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4446, "tokens_per_second_per_gpu": 17104.88, "total_tokens": 439086053 }, { "epoch": 0.27800700175043763, "grad_norm": 0.9079716801643372, "learning_rate": 2e-05, "loss": 0.6964, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4447, "tokens_per_second_per_gpu": 17412.5, "total_tokens": 439184183 }, { "epoch": 0.2780695173793448, "grad_norm": 0.8894321918487549, "learning_rate": 2e-05, "loss": 0.6987, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4448, "tokens_per_second_per_gpu": 17442.64, "total_tokens": 439284314 }, { "epoch": 0.27813203300825207, "grad_norm": 0.9292716979980469, "learning_rate": 2e-05, "loss": 0.6931, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4449, "tokens_per_second_per_gpu": 16718.71, "total_tokens": 439383400 }, { "epoch": 0.2781945486371593, "grad_norm": 0.8971027135848999, "learning_rate": 2e-05, "loss": 0.7253, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4450, "tokens_per_second_per_gpu": 17787.48, "total_tokens": 439484613 }, { "epoch": 0.2782570642660665, "grad_norm": 0.9051946401596069, "learning_rate": 2e-05, "loss": 0.6763, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4451, "tokens_per_second_per_gpu": 16784.04, "total_tokens": 439585998 }, { "epoch": 0.27831957989497375, "grad_norm": 0.9525728821754456, "learning_rate": 2e-05, "loss": 0.6824, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4452, "tokens_per_second_per_gpu": 17255.42, "total_tokens": 439682357 }, { "epoch": 0.278382095523881, "grad_norm": 0.8945310711860657, "learning_rate": 2e-05, "loss": 0.7069, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4453, "tokens_per_second_per_gpu": 18506.91, "total_tokens": 439784538 }, { "epoch": 0.2784446111527882, "grad_norm": 0.9439619779586792, "learning_rate": 2e-05, "loss": 0.7141, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4454, "tokens_per_second_per_gpu": 17644.37, "total_tokens": 439884106 }, { "epoch": 0.2785071267816954, "grad_norm": 0.9034668207168579, "learning_rate": 2e-05, "loss": 0.6716, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4455, "tokens_per_second_per_gpu": 17543.55, "total_tokens": 439982978 }, { "epoch": 0.27856964241060267, "grad_norm": 0.9079296588897705, "learning_rate": 2e-05, "loss": 0.6912, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4456, "tokens_per_second_per_gpu": 18492.71, "total_tokens": 440086184 }, { "epoch": 0.27863215803950986, "grad_norm": 0.8810790777206421, "learning_rate": 2e-05, "loss": 0.6784, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4457, "tokens_per_second_per_gpu": 18291.49, "total_tokens": 440190160 }, { "epoch": 0.2786946736684171, "grad_norm": 0.8820377588272095, "learning_rate": 2e-05, "loss": 0.6833, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4458, "tokens_per_second_per_gpu": 18687.53, "total_tokens": 440291576 }, { "epoch": 0.27875718929732435, "grad_norm": 0.9150700569152832, "learning_rate": 2e-05, "loss": 0.6921, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4459, "tokens_per_second_per_gpu": 18358.07, "total_tokens": 440393896 }, { "epoch": 0.27881970492623154, "grad_norm": 0.8919683694839478, "learning_rate": 2e-05, "loss": 0.6677, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4460, "tokens_per_second_per_gpu": 17862.18, "total_tokens": 440496558 }, { "epoch": 0.2788822205551388, "grad_norm": 0.8991852402687073, "learning_rate": 2e-05, "loss": 0.716, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4461, "tokens_per_second_per_gpu": 17713.64, "total_tokens": 440600251 }, { "epoch": 0.278944736184046, "grad_norm": 0.9247109889984131, "learning_rate": 2e-05, "loss": 0.6859, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4462, "tokens_per_second_per_gpu": 15901.35, "total_tokens": 440694264 }, { "epoch": 0.2790072518129532, "grad_norm": 0.9331235885620117, "learning_rate": 2e-05, "loss": 0.69, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4463, "tokens_per_second_per_gpu": 17257.55, "total_tokens": 440796015 }, { "epoch": 0.27906976744186046, "grad_norm": 0.9156697392463684, "learning_rate": 2e-05, "loss": 0.7265, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4464, "tokens_per_second_per_gpu": 18177.94, "total_tokens": 440899723 }, { "epoch": 0.2791322830707677, "grad_norm": 0.8801332116127014, "learning_rate": 2e-05, "loss": 0.6727, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4465, "tokens_per_second_per_gpu": 16681.99, "total_tokens": 440996875 }, { "epoch": 0.2791947986996749, "grad_norm": 0.909305214881897, "learning_rate": 2e-05, "loss": 0.6912, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4466, "tokens_per_second_per_gpu": 17204.46, "total_tokens": 441094172 }, { "epoch": 0.27925731432858214, "grad_norm": 0.9218716025352478, "learning_rate": 2e-05, "loss": 0.6813, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4467, "tokens_per_second_per_gpu": 16914.78, "total_tokens": 441190600 }, { "epoch": 0.2793198299574894, "grad_norm": 0.9516894221305847, "learning_rate": 2e-05, "loss": 0.6519, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4468, "tokens_per_second_per_gpu": 16800.76, "total_tokens": 441287980 }, { "epoch": 0.2793823455863966, "grad_norm": 0.9129638671875, "learning_rate": 2e-05, "loss": 0.6971, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4469, "tokens_per_second_per_gpu": 17530.68, "total_tokens": 441385060 }, { "epoch": 0.2794448612153038, "grad_norm": 0.8505317568778992, "learning_rate": 2e-05, "loss": 0.6855, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4470, "tokens_per_second_per_gpu": 18039.37, "total_tokens": 441487682 }, { "epoch": 0.27950737684421106, "grad_norm": 0.8787261843681335, "learning_rate": 2e-05, "loss": 0.6201, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4471, "tokens_per_second_per_gpu": 17751.95, "total_tokens": 441586043 }, { "epoch": 0.27956989247311825, "grad_norm": 0.8875812292098999, "learning_rate": 2e-05, "loss": 0.7129, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4472, "tokens_per_second_per_gpu": 16924.04, "total_tokens": 441686904 }, { "epoch": 0.2796324081020255, "grad_norm": 1.0023276805877686, "learning_rate": 2e-05, "loss": 0.6578, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4473, "tokens_per_second_per_gpu": 16790.06, "total_tokens": 441783690 }, { "epoch": 0.27969492373093274, "grad_norm": 0.9264691472053528, "learning_rate": 2e-05, "loss": 0.6857, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4474, "tokens_per_second_per_gpu": 17044.57, "total_tokens": 441880650 }, { "epoch": 0.27975743935983993, "grad_norm": 0.888350248336792, "learning_rate": 2e-05, "loss": 0.6779, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4475, "tokens_per_second_per_gpu": 17384.38, "total_tokens": 441977737 }, { "epoch": 0.2798199549887472, "grad_norm": 0.8706294298171997, "learning_rate": 2e-05, "loss": 0.6602, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4476, "tokens_per_second_per_gpu": 18096.01, "total_tokens": 442080354 }, { "epoch": 0.2798824706176544, "grad_norm": 0.9277985692024231, "learning_rate": 2e-05, "loss": 0.6692, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4477, "tokens_per_second_per_gpu": 15518.36, "total_tokens": 442172641 }, { "epoch": 0.27994498624656167, "grad_norm": 0.9762477278709412, "learning_rate": 2e-05, "loss": 0.6433, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4478, "tokens_per_second_per_gpu": 16621.97, "total_tokens": 442266589 }, { "epoch": 0.28000750187546886, "grad_norm": 0.8844097256660461, "learning_rate": 2e-05, "loss": 0.6641, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4479, "tokens_per_second_per_gpu": 17624.0, "total_tokens": 442365390 }, { "epoch": 0.2800700175043761, "grad_norm": 0.88573157787323, "learning_rate": 2e-05, "loss": 0.6774, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4480, "tokens_per_second_per_gpu": 17947.91, "total_tokens": 442466512 }, { "epoch": 0.28013253313328335, "grad_norm": 0.9433166980743408, "learning_rate": 2e-05, "loss": 0.727, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4481, "tokens_per_second_per_gpu": 17763.51, "total_tokens": 442567306 }, { "epoch": 0.28019504876219054, "grad_norm": 0.9272828102111816, "learning_rate": 2e-05, "loss": 0.7063, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4482, "tokens_per_second_per_gpu": 16818.99, "total_tokens": 442668067 }, { "epoch": 0.2802575643910978, "grad_norm": 0.8965269327163696, "learning_rate": 2e-05, "loss": 0.6975, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4483, "tokens_per_second_per_gpu": 17507.87, "total_tokens": 442767115 }, { "epoch": 0.280320080020005, "grad_norm": 0.9357346892356873, "learning_rate": 2e-05, "loss": 0.6825, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4484, "tokens_per_second_per_gpu": 17255.59, "total_tokens": 442865002 }, { "epoch": 0.2803825956489122, "grad_norm": 0.8942099213600159, "learning_rate": 2e-05, "loss": 0.632, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4485, "tokens_per_second_per_gpu": 17611.27, "total_tokens": 442958088 }, { "epoch": 0.28044511127781946, "grad_norm": 0.8948230743408203, "learning_rate": 2e-05, "loss": 0.6853, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4486, "tokens_per_second_per_gpu": 17209.58, "total_tokens": 443055690 }, { "epoch": 0.2805076269067267, "grad_norm": 0.9013861417770386, "learning_rate": 2e-05, "loss": 0.7152, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4487, "tokens_per_second_per_gpu": 16851.45, "total_tokens": 443157250 }, { "epoch": 0.2805701425356339, "grad_norm": 0.8724909424781799, "learning_rate": 2e-05, "loss": 0.6473, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4488, "tokens_per_second_per_gpu": 17126.91, "total_tokens": 443256467 }, { "epoch": 0.28063265816454114, "grad_norm": 0.9423294067382812, "learning_rate": 2e-05, "loss": 0.7001, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4489, "tokens_per_second_per_gpu": 16693.75, "total_tokens": 443352766 }, { "epoch": 0.2806951737934484, "grad_norm": 0.9368667006492615, "learning_rate": 2e-05, "loss": 0.7163, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4490, "tokens_per_second_per_gpu": 16507.39, "total_tokens": 443448183 }, { "epoch": 0.28075768942235557, "grad_norm": 0.8943200707435608, "learning_rate": 2e-05, "loss": 0.6504, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4491, "tokens_per_second_per_gpu": 16564.24, "total_tokens": 443545484 }, { "epoch": 0.2808202050512628, "grad_norm": 0.8851093649864197, "learning_rate": 2e-05, "loss": 0.661, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4492, "tokens_per_second_per_gpu": 18275.48, "total_tokens": 443646718 }, { "epoch": 0.28088272068017006, "grad_norm": 0.9198023676872253, "learning_rate": 2e-05, "loss": 0.6932, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4493, "tokens_per_second_per_gpu": 16771.16, "total_tokens": 443742285 }, { "epoch": 0.28094523630907725, "grad_norm": 0.9405249357223511, "learning_rate": 2e-05, "loss": 0.7352, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4494, "tokens_per_second_per_gpu": 17551.5, "total_tokens": 443839766 }, { "epoch": 0.2810077519379845, "grad_norm": 0.9570439457893372, "learning_rate": 2e-05, "loss": 0.7169, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4495, "tokens_per_second_per_gpu": 18875.73, "total_tokens": 443943999 }, { "epoch": 0.28107026756689174, "grad_norm": 0.9384526610374451, "learning_rate": 2e-05, "loss": 0.7483, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4496, "tokens_per_second_per_gpu": 18651.72, "total_tokens": 444048567 }, { "epoch": 0.28113278319579893, "grad_norm": 0.9183316826820374, "learning_rate": 2e-05, "loss": 0.6997, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4497, "tokens_per_second_per_gpu": 16936.61, "total_tokens": 444145378 }, { "epoch": 0.2811952988247062, "grad_norm": 0.9230997562408447, "learning_rate": 2e-05, "loss": 0.6624, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4498, "tokens_per_second_per_gpu": 15975.26, "total_tokens": 444239681 }, { "epoch": 0.2812578144536134, "grad_norm": 0.9209164381027222, "learning_rate": 2e-05, "loss": 0.6428, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4499, "tokens_per_second_per_gpu": 16730.63, "total_tokens": 444332198 }, { "epoch": 0.2813203300825206, "grad_norm": 0.8722959756851196, "learning_rate": 2e-05, "loss": 0.7229, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4500, "tokens_per_second_per_gpu": 17965.78, "total_tokens": 444433298 }, { "epoch": 0.28138284571142785, "grad_norm": 0.888556718826294, "learning_rate": 2e-05, "loss": 0.641, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4501, "tokens_per_second_per_gpu": 17695.8, "total_tokens": 444531510 }, { "epoch": 0.2814453613403351, "grad_norm": 0.901879072189331, "learning_rate": 2e-05, "loss": 0.6961, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4502, "tokens_per_second_per_gpu": 17009.1, "total_tokens": 444632480 }, { "epoch": 0.2815078769692423, "grad_norm": 0.9198864102363586, "learning_rate": 2e-05, "loss": 0.6896, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4503, "tokens_per_second_per_gpu": 17433.21, "total_tokens": 444730781 }, { "epoch": 0.28157039259814953, "grad_norm": 0.9179699420928955, "learning_rate": 2e-05, "loss": 0.6727, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4504, "tokens_per_second_per_gpu": 16846.25, "total_tokens": 444828603 }, { "epoch": 0.2816329082270568, "grad_norm": 0.9137625694274902, "learning_rate": 2e-05, "loss": 0.684, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4505, "tokens_per_second_per_gpu": 17113.86, "total_tokens": 444925101 }, { "epoch": 0.28169542385596397, "grad_norm": 0.8886056542396545, "learning_rate": 2e-05, "loss": 0.6543, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4506, "tokens_per_second_per_gpu": 17077.98, "total_tokens": 445024721 }, { "epoch": 0.2817579394848712, "grad_norm": 0.9033915996551514, "learning_rate": 2e-05, "loss": 0.6772, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4507, "tokens_per_second_per_gpu": 16062.49, "total_tokens": 445120019 }, { "epoch": 0.28182045511377846, "grad_norm": 0.8759021759033203, "learning_rate": 2e-05, "loss": 0.6925, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4508, "tokens_per_second_per_gpu": 17840.12, "total_tokens": 445219369 }, { "epoch": 0.28188297074268565, "grad_norm": 0.9051985144615173, "learning_rate": 2e-05, "loss": 0.6599, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4509, "tokens_per_second_per_gpu": 16880.82, "total_tokens": 445317251 }, { "epoch": 0.2819454863715929, "grad_norm": 0.8763550519943237, "learning_rate": 2e-05, "loss": 0.6788, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4510, "tokens_per_second_per_gpu": 17800.57, "total_tokens": 445417473 }, { "epoch": 0.28200800200050014, "grad_norm": 0.9041216969490051, "learning_rate": 2e-05, "loss": 0.6523, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4511, "tokens_per_second_per_gpu": 17650.06, "total_tokens": 445515201 }, { "epoch": 0.2820705176294073, "grad_norm": 0.8856586217880249, "learning_rate": 2e-05, "loss": 0.6778, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4512, "tokens_per_second_per_gpu": 17271.86, "total_tokens": 445617508 }, { "epoch": 0.28213303325831457, "grad_norm": 0.9210460186004639, "learning_rate": 2e-05, "loss": 0.6814, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4513, "tokens_per_second_per_gpu": 16416.6, "total_tokens": 445710546 }, { "epoch": 0.2821955488872218, "grad_norm": 0.9338050484657288, "learning_rate": 2e-05, "loss": 0.6606, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4514, "tokens_per_second_per_gpu": 17468.76, "total_tokens": 445807117 }, { "epoch": 0.28225806451612906, "grad_norm": 0.8982973694801331, "learning_rate": 2e-05, "loss": 0.7101, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4515, "tokens_per_second_per_gpu": 17354.56, "total_tokens": 445906549 }, { "epoch": 0.28232058014503625, "grad_norm": 0.9172009825706482, "learning_rate": 2e-05, "loss": 0.6629, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4516, "tokens_per_second_per_gpu": 17307.91, "total_tokens": 446006323 }, { "epoch": 0.2823830957739435, "grad_norm": 0.8797212839126587, "learning_rate": 2e-05, "loss": 0.6705, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4517, "tokens_per_second_per_gpu": 17527.1, "total_tokens": 446106841 }, { "epoch": 0.28244561140285074, "grad_norm": 0.8466433882713318, "learning_rate": 2e-05, "loss": 0.642, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4518, "tokens_per_second_per_gpu": 17253.55, "total_tokens": 446208819 }, { "epoch": 0.2825081270317579, "grad_norm": 0.9005940556526184, "learning_rate": 2e-05, "loss": 0.6704, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4519, "tokens_per_second_per_gpu": 18628.26, "total_tokens": 446308936 }, { "epoch": 0.28257064266066517, "grad_norm": 0.8869487643241882, "learning_rate": 2e-05, "loss": 0.7366, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4520, "tokens_per_second_per_gpu": 18075.06, "total_tokens": 446412827 }, { "epoch": 0.2826331582895724, "grad_norm": 0.9416405558586121, "learning_rate": 2e-05, "loss": 0.6763, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4521, "tokens_per_second_per_gpu": 17238.89, "total_tokens": 446512593 }, { "epoch": 0.2826956739184796, "grad_norm": 0.8938503861427307, "learning_rate": 2e-05, "loss": 0.6853, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4522, "tokens_per_second_per_gpu": 17196.49, "total_tokens": 446609652 }, { "epoch": 0.28275818954738685, "grad_norm": 0.919592559337616, "learning_rate": 2e-05, "loss": 0.7141, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4523, "tokens_per_second_per_gpu": 17651.76, "total_tokens": 446709641 }, { "epoch": 0.2828207051762941, "grad_norm": 0.8599246144294739, "learning_rate": 2e-05, "loss": 0.6501, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4524, "tokens_per_second_per_gpu": 17623.12, "total_tokens": 446811655 }, { "epoch": 0.2828832208052013, "grad_norm": 0.8609855771064758, "learning_rate": 2e-05, "loss": 0.652, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4525, "tokens_per_second_per_gpu": 17508.26, "total_tokens": 446913357 }, { "epoch": 0.28294573643410853, "grad_norm": 0.986647367477417, "learning_rate": 2e-05, "loss": 0.7267, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4526, "tokens_per_second_per_gpu": 16443.88, "total_tokens": 447011273 }, { "epoch": 0.2830082520630158, "grad_norm": 0.9031490683555603, "learning_rate": 2e-05, "loss": 0.6159, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4527, "tokens_per_second_per_gpu": 16382.92, "total_tokens": 447105330 }, { "epoch": 0.28307076769192296, "grad_norm": 0.9534333944320679, "learning_rate": 2e-05, "loss": 0.6523, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4528, "tokens_per_second_per_gpu": 17454.12, "total_tokens": 447200292 }, { "epoch": 0.2831332833208302, "grad_norm": 0.922755777835846, "learning_rate": 2e-05, "loss": 0.6816, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4529, "tokens_per_second_per_gpu": 16920.57, "total_tokens": 447302285 }, { "epoch": 0.28319579894973745, "grad_norm": 0.944061279296875, "learning_rate": 2e-05, "loss": 0.6963, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4530, "tokens_per_second_per_gpu": 17351.32, "total_tokens": 447397474 }, { "epoch": 0.28325831457864464, "grad_norm": 0.9132556319236755, "learning_rate": 2e-05, "loss": 0.6711, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4531, "tokens_per_second_per_gpu": 16570.58, "total_tokens": 447494812 }, { "epoch": 0.2833208302075519, "grad_norm": 0.933510422706604, "learning_rate": 2e-05, "loss": 0.6969, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4532, "tokens_per_second_per_gpu": 17906.68, "total_tokens": 447595387 }, { "epoch": 0.28338334583645913, "grad_norm": 0.9611882567405701, "learning_rate": 2e-05, "loss": 0.7256, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4533, "tokens_per_second_per_gpu": 16129.15, "total_tokens": 447691507 }, { "epoch": 0.2834458614653663, "grad_norm": 0.9402456283569336, "learning_rate": 2e-05, "loss": 0.7119, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4534, "tokens_per_second_per_gpu": 16471.95, "total_tokens": 447786916 }, { "epoch": 0.28350837709427357, "grad_norm": 0.8985826373100281, "learning_rate": 2e-05, "loss": 0.6399, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4535, "tokens_per_second_per_gpu": 17963.07, "total_tokens": 447883403 }, { "epoch": 0.2835708927231808, "grad_norm": 0.9612495303153992, "learning_rate": 2e-05, "loss": 0.6874, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4536, "tokens_per_second_per_gpu": 17765.89, "total_tokens": 447982609 }, { "epoch": 0.283633408352088, "grad_norm": 0.9279732704162598, "learning_rate": 2e-05, "loss": 0.7117, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4537, "tokens_per_second_per_gpu": 18820.01, "total_tokens": 448086584 }, { "epoch": 0.28369592398099525, "grad_norm": 0.926957905292511, "learning_rate": 2e-05, "loss": 0.6521, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4538, "tokens_per_second_per_gpu": 17781.52, "total_tokens": 448185812 }, { "epoch": 0.2837584396099025, "grad_norm": 0.9473103880882263, "learning_rate": 2e-05, "loss": 0.6631, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4539, "tokens_per_second_per_gpu": 16392.6, "total_tokens": 448282847 }, { "epoch": 0.2838209552388097, "grad_norm": 0.8837048411369324, "learning_rate": 2e-05, "loss": 0.6786, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4540, "tokens_per_second_per_gpu": 17892.43, "total_tokens": 448385877 }, { "epoch": 0.2838834708677169, "grad_norm": 0.9361984729766846, "learning_rate": 2e-05, "loss": 0.7077, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4541, "tokens_per_second_per_gpu": 16799.44, "total_tokens": 448483960 }, { "epoch": 0.28394598649662417, "grad_norm": 0.8746396899223328, "learning_rate": 2e-05, "loss": 0.6667, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4542, "tokens_per_second_per_gpu": 17725.94, "total_tokens": 448584708 }, { "epoch": 0.28400850212553136, "grad_norm": 0.8822532892227173, "learning_rate": 2e-05, "loss": 0.7339, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4543, "tokens_per_second_per_gpu": 18254.8, "total_tokens": 448688822 }, { "epoch": 0.2840710177544386, "grad_norm": 0.9110077619552612, "learning_rate": 2e-05, "loss": 0.6797, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4544, "tokens_per_second_per_gpu": 17559.97, "total_tokens": 448788493 }, { "epoch": 0.28413353338334585, "grad_norm": 0.9325916171073914, "learning_rate": 2e-05, "loss": 0.6998, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4545, "tokens_per_second_per_gpu": 17889.03, "total_tokens": 448887352 }, { "epoch": 0.28419604901225304, "grad_norm": 0.9022032022476196, "learning_rate": 2e-05, "loss": 0.6401, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4546, "tokens_per_second_per_gpu": 16593.34, "total_tokens": 448981039 }, { "epoch": 0.2842585646411603, "grad_norm": 0.9223612546920776, "learning_rate": 2e-05, "loss": 0.6712, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4547, "tokens_per_second_per_gpu": 16597.33, "total_tokens": 449079169 }, { "epoch": 0.2843210802700675, "grad_norm": 0.9265344738960266, "learning_rate": 2e-05, "loss": 0.6625, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4548, "tokens_per_second_per_gpu": 17365.96, "total_tokens": 449177752 }, { "epoch": 0.2843835958989747, "grad_norm": 0.9362469911575317, "learning_rate": 2e-05, "loss": 0.6872, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4549, "tokens_per_second_per_gpu": 16813.92, "total_tokens": 449277538 }, { "epoch": 0.28444611152788196, "grad_norm": 0.9055923819541931, "learning_rate": 2e-05, "loss": 0.6973, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4550, "tokens_per_second_per_gpu": 17084.58, "total_tokens": 449377430 }, { "epoch": 0.2845086271567892, "grad_norm": 0.9138521552085876, "learning_rate": 2e-05, "loss": 0.6945, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4551, "tokens_per_second_per_gpu": 17513.47, "total_tokens": 449478008 }, { "epoch": 0.28457114278569645, "grad_norm": 0.849443256855011, "learning_rate": 2e-05, "loss": 0.6545, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4552, "tokens_per_second_per_gpu": 19147.01, "total_tokens": 449582258 }, { "epoch": 0.28463365841460364, "grad_norm": 0.9008749127388, "learning_rate": 2e-05, "loss": 0.692, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4553, "tokens_per_second_per_gpu": 17490.25, "total_tokens": 449683105 }, { "epoch": 0.2846961740435109, "grad_norm": 0.9922040700912476, "learning_rate": 2e-05, "loss": 0.6732, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4554, "tokens_per_second_per_gpu": 16512.58, "total_tokens": 449774211 }, { "epoch": 0.28475868967241813, "grad_norm": 0.9175394177436829, "learning_rate": 2e-05, "loss": 0.6595, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4555, "tokens_per_second_per_gpu": 16688.66, "total_tokens": 449869463 }, { "epoch": 0.2848212053013253, "grad_norm": 0.9088776111602783, "learning_rate": 2e-05, "loss": 0.6868, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4556, "tokens_per_second_per_gpu": 17616.1, "total_tokens": 449965253 }, { "epoch": 0.28488372093023256, "grad_norm": 0.9023658037185669, "learning_rate": 2e-05, "loss": 0.636, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4557, "tokens_per_second_per_gpu": 17115.7, "total_tokens": 450064397 }, { "epoch": 0.2849462365591398, "grad_norm": 0.8858731389045715, "learning_rate": 2e-05, "loss": 0.6755, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4558, "tokens_per_second_per_gpu": 17114.81, "total_tokens": 450164477 }, { "epoch": 0.285008752188047, "grad_norm": 10.513154029846191, "learning_rate": 2e-05, "loss": 0.6951, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4559, "tokens_per_second_per_gpu": 17744.13, "total_tokens": 450264622 }, { "epoch": 0.28507126781695424, "grad_norm": 0.8963179588317871, "learning_rate": 2e-05, "loss": 0.6585, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4560, "tokens_per_second_per_gpu": 17397.88, "total_tokens": 450365505 }, { "epoch": 0.2851337834458615, "grad_norm": 0.9761759638786316, "learning_rate": 2e-05, "loss": 0.6914, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4561, "tokens_per_second_per_gpu": 17040.23, "total_tokens": 450462989 }, { "epoch": 0.2851962990747687, "grad_norm": 0.9932974576950073, "learning_rate": 2e-05, "loss": 0.6705, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4562, "tokens_per_second_per_gpu": 17649.06, "total_tokens": 450561141 }, { "epoch": 0.2852588147036759, "grad_norm": 0.9364892840385437, "learning_rate": 2e-05, "loss": 0.7157, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4563, "tokens_per_second_per_gpu": 17716.93, "total_tokens": 450659682 }, { "epoch": 0.28532133033258317, "grad_norm": 0.8862721920013428, "learning_rate": 2e-05, "loss": 0.6786, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4564, "tokens_per_second_per_gpu": 18190.83, "total_tokens": 450763062 }, { "epoch": 0.28538384596149036, "grad_norm": 0.875055730342865, "learning_rate": 2e-05, "loss": 0.6835, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4565, "tokens_per_second_per_gpu": 16808.89, "total_tokens": 450862392 }, { "epoch": 0.2854463615903976, "grad_norm": 0.8779868483543396, "learning_rate": 2e-05, "loss": 0.6392, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4566, "tokens_per_second_per_gpu": 17555.13, "total_tokens": 450959882 }, { "epoch": 0.28550887721930485, "grad_norm": 0.9210218787193298, "learning_rate": 2e-05, "loss": 0.6726, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4567, "tokens_per_second_per_gpu": 17559.99, "total_tokens": 451057919 }, { "epoch": 0.28557139284821204, "grad_norm": 0.889734148979187, "learning_rate": 2e-05, "loss": 0.6925, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4568, "tokens_per_second_per_gpu": 16578.04, "total_tokens": 451153755 }, { "epoch": 0.2856339084771193, "grad_norm": 0.9200543761253357, "learning_rate": 2e-05, "loss": 0.6766, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4569, "tokens_per_second_per_gpu": 17603.62, "total_tokens": 451251034 }, { "epoch": 0.2856964241060265, "grad_norm": 0.8547821044921875, "learning_rate": 2e-05, "loss": 0.6304, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4570, "tokens_per_second_per_gpu": 17304.31, "total_tokens": 451352333 }, { "epoch": 0.2857589397349337, "grad_norm": 0.8998305797576904, "learning_rate": 2e-05, "loss": 0.6938, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4571, "tokens_per_second_per_gpu": 18378.52, "total_tokens": 451452056 }, { "epoch": 0.28582145536384096, "grad_norm": 0.9130528569221497, "learning_rate": 2e-05, "loss": 0.7169, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4572, "tokens_per_second_per_gpu": 18608.77, "total_tokens": 451553015 }, { "epoch": 0.2858839709927482, "grad_norm": 0.9021922945976257, "learning_rate": 2e-05, "loss": 0.6579, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4573, "tokens_per_second_per_gpu": 18045.47, "total_tokens": 451654015 }, { "epoch": 0.2859464866216554, "grad_norm": 0.8442277908325195, "learning_rate": 2e-05, "loss": 0.6678, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4574, "tokens_per_second_per_gpu": 18483.43, "total_tokens": 451757259 }, { "epoch": 0.28600900225056264, "grad_norm": 0.9002619981765747, "learning_rate": 2e-05, "loss": 0.6683, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4575, "tokens_per_second_per_gpu": 18164.78, "total_tokens": 451858714 }, { "epoch": 0.2860715178794699, "grad_norm": 0.8695197701454163, "learning_rate": 2e-05, "loss": 0.6969, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4576, "tokens_per_second_per_gpu": 18789.52, "total_tokens": 451963606 }, { "epoch": 0.2861340335083771, "grad_norm": 0.8671974539756775, "learning_rate": 2e-05, "loss": 0.6772, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4577, "tokens_per_second_per_gpu": 17278.9, "total_tokens": 452063087 }, { "epoch": 0.2861965491372843, "grad_norm": 0.9237339496612549, "learning_rate": 2e-05, "loss": 0.6768, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4578, "tokens_per_second_per_gpu": 17844.04, "total_tokens": 452156611 }, { "epoch": 0.28625906476619156, "grad_norm": 0.8739302158355713, "learning_rate": 2e-05, "loss": 0.6567, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4579, "tokens_per_second_per_gpu": 17073.74, "total_tokens": 452257153 }, { "epoch": 0.28632158039509875, "grad_norm": 0.884546160697937, "learning_rate": 2e-05, "loss": 0.6948, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4580, "tokens_per_second_per_gpu": 17987.04, "total_tokens": 452357678 }, { "epoch": 0.286384096024006, "grad_norm": 0.8955912590026855, "learning_rate": 2e-05, "loss": 0.6518, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4581, "tokens_per_second_per_gpu": 18036.07, "total_tokens": 452457808 }, { "epoch": 0.28644661165291324, "grad_norm": 0.9209423661231995, "learning_rate": 2e-05, "loss": 0.6509, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4582, "tokens_per_second_per_gpu": 15585.2, "total_tokens": 452551379 }, { "epoch": 0.28650912728182043, "grad_norm": 0.8706819415092468, "learning_rate": 2e-05, "loss": 0.6966, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4583, "tokens_per_second_per_gpu": 19112.61, "total_tokens": 452655286 }, { "epoch": 0.2865716429107277, "grad_norm": 0.9322469830513, "learning_rate": 2e-05, "loss": 0.6456, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4584, "tokens_per_second_per_gpu": 16785.92, "total_tokens": 452750256 }, { "epoch": 0.2866341585396349, "grad_norm": 0.9426923990249634, "learning_rate": 2e-05, "loss": 0.6748, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4585, "tokens_per_second_per_gpu": 17973.48, "total_tokens": 452852699 }, { "epoch": 0.2866966741685421, "grad_norm": 0.8842374682426453, "learning_rate": 2e-05, "loss": 0.6598, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4586, "tokens_per_second_per_gpu": 17190.44, "total_tokens": 452951422 }, { "epoch": 0.28675918979744935, "grad_norm": 0.8955393433570862, "learning_rate": 2e-05, "loss": 0.7099, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4587, "tokens_per_second_per_gpu": 17486.1, "total_tokens": 453049473 }, { "epoch": 0.2868217054263566, "grad_norm": 0.9236106872558594, "learning_rate": 2e-05, "loss": 0.6745, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4588, "tokens_per_second_per_gpu": 15424.75, "total_tokens": 453142281 }, { "epoch": 0.2868842210552638, "grad_norm": 0.8950761556625366, "learning_rate": 2e-05, "loss": 0.7076, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4589, "tokens_per_second_per_gpu": 17751.41, "total_tokens": 453243413 }, { "epoch": 0.28694673668417103, "grad_norm": 0.924146294593811, "learning_rate": 2e-05, "loss": 0.7211, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4590, "tokens_per_second_per_gpu": 17805.6, "total_tokens": 453347373 }, { "epoch": 0.2870092523130783, "grad_norm": 0.9180746674537659, "learning_rate": 2e-05, "loss": 0.7014, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4591, "tokens_per_second_per_gpu": 18007.57, "total_tokens": 453447167 }, { "epoch": 0.2870717679419855, "grad_norm": 0.8698441386222839, "learning_rate": 2e-05, "loss": 0.6279, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4592, "tokens_per_second_per_gpu": 17095.05, "total_tokens": 453541689 }, { "epoch": 0.2871342835708927, "grad_norm": 0.888087809085846, "learning_rate": 2e-05, "loss": 0.6986, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4593, "tokens_per_second_per_gpu": 16378.34, "total_tokens": 453638929 }, { "epoch": 0.28719679919979996, "grad_norm": 0.9250526428222656, "learning_rate": 2e-05, "loss": 0.6963, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4594, "tokens_per_second_per_gpu": 18085.12, "total_tokens": 453737759 }, { "epoch": 0.2872593148287072, "grad_norm": 0.9289258718490601, "learning_rate": 2e-05, "loss": 0.6664, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4595, "tokens_per_second_per_gpu": 16896.64, "total_tokens": 453836471 }, { "epoch": 0.2873218304576144, "grad_norm": 0.8834477663040161, "learning_rate": 2e-05, "loss": 0.6847, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4596, "tokens_per_second_per_gpu": 17796.55, "total_tokens": 453934196 }, { "epoch": 0.28738434608652164, "grad_norm": 0.880374014377594, "learning_rate": 2e-05, "loss": 0.7024, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4597, "tokens_per_second_per_gpu": 17974.02, "total_tokens": 454033719 }, { "epoch": 0.2874468617154289, "grad_norm": 0.8914144039154053, "learning_rate": 2e-05, "loss": 0.6669, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4598, "tokens_per_second_per_gpu": 16727.35, "total_tokens": 454129753 }, { "epoch": 0.28750937734433607, "grad_norm": 0.8820226192474365, "learning_rate": 2e-05, "loss": 0.6887, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4599, "tokens_per_second_per_gpu": 17086.82, "total_tokens": 454229515 }, { "epoch": 0.2875718929732433, "grad_norm": 0.9060834646224976, "learning_rate": 2e-05, "loss": 0.6741, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4600, "tokens_per_second_per_gpu": 17748.58, "total_tokens": 454328997 }, { "epoch": 0.28763440860215056, "grad_norm": 0.8796157836914062, "learning_rate": 2e-05, "loss": 0.6529, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4601, "tokens_per_second_per_gpu": 17768.96, "total_tokens": 454426887 }, { "epoch": 0.28769692423105775, "grad_norm": 0.9031727313995361, "learning_rate": 2e-05, "loss": 0.7108, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4602, "tokens_per_second_per_gpu": 17361.06, "total_tokens": 454526012 }, { "epoch": 0.287759439859965, "grad_norm": 0.9641749262809753, "learning_rate": 2e-05, "loss": 0.6732, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4603, "tokens_per_second_per_gpu": 16694.05, "total_tokens": 454622335 }, { "epoch": 0.28782195548887224, "grad_norm": 0.9207668900489807, "learning_rate": 2e-05, "loss": 0.6444, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4604, "tokens_per_second_per_gpu": 17942.58, "total_tokens": 454722719 }, { "epoch": 0.28788447111777943, "grad_norm": 0.911717414855957, "learning_rate": 2e-05, "loss": 0.647, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4605, "tokens_per_second_per_gpu": 17041.96, "total_tokens": 454818023 }, { "epoch": 0.2879469867466867, "grad_norm": 0.9024289846420288, "learning_rate": 2e-05, "loss": 0.6578, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4606, "tokens_per_second_per_gpu": 16643.92, "total_tokens": 454913856 }, { "epoch": 0.2880095023755939, "grad_norm": 0.9207079410552979, "learning_rate": 2e-05, "loss": 0.6919, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4607, "tokens_per_second_per_gpu": 17671.0, "total_tokens": 455016788 }, { "epoch": 0.2880720180045011, "grad_norm": 0.9095269441604614, "learning_rate": 2e-05, "loss": 0.6421, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4608, "tokens_per_second_per_gpu": 17024.69, "total_tokens": 455114059 }, { "epoch": 0.28813453363340835, "grad_norm": 0.8928118348121643, "learning_rate": 2e-05, "loss": 0.725, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4609, "tokens_per_second_per_gpu": 16708.19, "total_tokens": 455215570 }, { "epoch": 0.2881970492623156, "grad_norm": 0.9072527885437012, "learning_rate": 2e-05, "loss": 0.6952, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4610, "tokens_per_second_per_gpu": 17724.47, "total_tokens": 455315732 }, { "epoch": 0.2882595648912228, "grad_norm": 0.8491629362106323, "learning_rate": 2e-05, "loss": 0.6288, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4611, "tokens_per_second_per_gpu": 17048.68, "total_tokens": 455412295 }, { "epoch": 0.28832208052013003, "grad_norm": 0.8761858940124512, "learning_rate": 2e-05, "loss": 0.713, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4612, "tokens_per_second_per_gpu": 17741.82, "total_tokens": 455514671 }, { "epoch": 0.2883845961490373, "grad_norm": 0.8895455598831177, "learning_rate": 2e-05, "loss": 0.688, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4613, "tokens_per_second_per_gpu": 17494.32, "total_tokens": 455613208 }, { "epoch": 0.28844711177794446, "grad_norm": 0.8875254392623901, "learning_rate": 2e-05, "loss": 0.693, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4614, "tokens_per_second_per_gpu": 17407.91, "total_tokens": 455711120 }, { "epoch": 0.2885096274068517, "grad_norm": 0.8925876617431641, "learning_rate": 2e-05, "loss": 0.6819, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4615, "tokens_per_second_per_gpu": 16448.31, "total_tokens": 455810023 }, { "epoch": 0.28857214303575895, "grad_norm": 0.8844373226165771, "learning_rate": 2e-05, "loss": 0.6836, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4616, "tokens_per_second_per_gpu": 17626.45, "total_tokens": 455911469 }, { "epoch": 0.28863465866466614, "grad_norm": 0.8972629904747009, "learning_rate": 2e-05, "loss": 0.6667, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4617, "tokens_per_second_per_gpu": 16794.98, "total_tokens": 456008371 }, { "epoch": 0.2886971742935734, "grad_norm": 0.8536153435707092, "learning_rate": 2e-05, "loss": 0.6641, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4618, "tokens_per_second_per_gpu": 17176.54, "total_tokens": 456109671 }, { "epoch": 0.28875968992248063, "grad_norm": 0.9125744104385376, "learning_rate": 2e-05, "loss": 0.6658, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4619, "tokens_per_second_per_gpu": 15908.88, "total_tokens": 456203395 }, { "epoch": 0.2888222055513878, "grad_norm": 0.909274160861969, "learning_rate": 2e-05, "loss": 0.6818, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4620, "tokens_per_second_per_gpu": 17526.23, "total_tokens": 456303071 }, { "epoch": 0.28888472118029507, "grad_norm": 0.8815522193908691, "learning_rate": 2e-05, "loss": 0.6865, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4621, "tokens_per_second_per_gpu": 18172.42, "total_tokens": 456404497 }, { "epoch": 0.2889472368092023, "grad_norm": 0.9171164631843567, "learning_rate": 2e-05, "loss": 0.6887, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4622, "tokens_per_second_per_gpu": 17028.98, "total_tokens": 456502224 }, { "epoch": 0.2890097524381095, "grad_norm": 0.9271124601364136, "learning_rate": 2e-05, "loss": 0.6587, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4623, "tokens_per_second_per_gpu": 17091.86, "total_tokens": 456598603 }, { "epoch": 0.28907226806701675, "grad_norm": 0.8989242315292358, "learning_rate": 2e-05, "loss": 0.7225, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4624, "tokens_per_second_per_gpu": 18001.97, "total_tokens": 456699862 }, { "epoch": 0.289134783695924, "grad_norm": 0.9048685431480408, "learning_rate": 2e-05, "loss": 0.6882, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4625, "tokens_per_second_per_gpu": 18153.49, "total_tokens": 456803934 }, { "epoch": 0.2891972993248312, "grad_norm": 0.9136685132980347, "learning_rate": 2e-05, "loss": 0.6771, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4626, "tokens_per_second_per_gpu": 17540.96, "total_tokens": 456901775 }, { "epoch": 0.2892598149537384, "grad_norm": 0.8952257037162781, "learning_rate": 2e-05, "loss": 0.6312, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4627, "tokens_per_second_per_gpu": 17124.21, "total_tokens": 456998274 }, { "epoch": 0.28932233058264567, "grad_norm": 0.9030811190605164, "learning_rate": 2e-05, "loss": 0.6339, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4628, "tokens_per_second_per_gpu": 16833.76, "total_tokens": 457093737 }, { "epoch": 0.2893848462115529, "grad_norm": 0.8859223127365112, "learning_rate": 2e-05, "loss": 0.6633, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4629, "tokens_per_second_per_gpu": 16420.18, "total_tokens": 457192586 }, { "epoch": 0.2894473618404601, "grad_norm": 0.9005017280578613, "learning_rate": 2e-05, "loss": 0.6956, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4630, "tokens_per_second_per_gpu": 17857.38, "total_tokens": 457292260 }, { "epoch": 0.28950987746936735, "grad_norm": 0.9056865572929382, "learning_rate": 2e-05, "loss": 0.6381, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4631, "tokens_per_second_per_gpu": 17917.97, "total_tokens": 457391006 }, { "epoch": 0.2895723930982746, "grad_norm": 0.9219943284988403, "learning_rate": 2e-05, "loss": 0.6941, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4632, "tokens_per_second_per_gpu": 17642.77, "total_tokens": 457488281 }, { "epoch": 0.2896349087271818, "grad_norm": 0.8804592490196228, "learning_rate": 2e-05, "loss": 0.6786, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4633, "tokens_per_second_per_gpu": 17468.52, "total_tokens": 457587076 }, { "epoch": 0.28969742435608903, "grad_norm": 0.912562370300293, "learning_rate": 2e-05, "loss": 0.675, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4634, "tokens_per_second_per_gpu": 16218.08, "total_tokens": 457681770 }, { "epoch": 0.2897599399849963, "grad_norm": 0.912851095199585, "learning_rate": 2e-05, "loss": 0.685, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4635, "tokens_per_second_per_gpu": 17099.96, "total_tokens": 457778920 }, { "epoch": 0.28982245561390346, "grad_norm": 0.8597666621208191, "learning_rate": 2e-05, "loss": 0.6655, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4636, "tokens_per_second_per_gpu": 17212.42, "total_tokens": 457880049 }, { "epoch": 0.2898849712428107, "grad_norm": 0.9306007623672485, "learning_rate": 2e-05, "loss": 0.7113, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4637, "tokens_per_second_per_gpu": 18245.71, "total_tokens": 457982698 }, { "epoch": 0.28994748687171795, "grad_norm": 0.8682540059089661, "learning_rate": 2e-05, "loss": 0.6358, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4638, "tokens_per_second_per_gpu": 17105.52, "total_tokens": 458079708 }, { "epoch": 0.29001000250062514, "grad_norm": 0.8986750245094299, "learning_rate": 2e-05, "loss": 0.6943, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4639, "tokens_per_second_per_gpu": 17885.01, "total_tokens": 458178794 }, { "epoch": 0.2900725181295324, "grad_norm": 0.9378846883773804, "learning_rate": 2e-05, "loss": 0.6994, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4640, "tokens_per_second_per_gpu": 17713.04, "total_tokens": 458278180 }, { "epoch": 0.29013503375843963, "grad_norm": 0.890763521194458, "learning_rate": 2e-05, "loss": 0.6832, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4641, "tokens_per_second_per_gpu": 16929.9, "total_tokens": 458375822 }, { "epoch": 0.2901975493873468, "grad_norm": 0.926540195941925, "learning_rate": 2e-05, "loss": 0.6937, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4642, "tokens_per_second_per_gpu": 17782.45, "total_tokens": 458475221 }, { "epoch": 0.29026006501625407, "grad_norm": 0.8409102559089661, "learning_rate": 2e-05, "loss": 0.6398, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4643, "tokens_per_second_per_gpu": 18238.82, "total_tokens": 458577329 }, { "epoch": 0.2903225806451613, "grad_norm": 0.8875606656074524, "learning_rate": 2e-05, "loss": 0.6784, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4644, "tokens_per_second_per_gpu": 17718.59, "total_tokens": 458674684 }, { "epoch": 0.2903850962740685, "grad_norm": 0.9180484414100647, "learning_rate": 2e-05, "loss": 0.7315, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4645, "tokens_per_second_per_gpu": 18014.97, "total_tokens": 458775841 }, { "epoch": 0.29044761190297574, "grad_norm": 0.9127359390258789, "learning_rate": 2e-05, "loss": 0.6491, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4646, "tokens_per_second_per_gpu": 16841.4, "total_tokens": 458868568 }, { "epoch": 0.290510127531883, "grad_norm": 0.9537607431411743, "learning_rate": 2e-05, "loss": 0.7083, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4647, "tokens_per_second_per_gpu": 17107.16, "total_tokens": 458964600 }, { "epoch": 0.2905726431607902, "grad_norm": 0.903607964515686, "learning_rate": 2e-05, "loss": 0.6992, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4648, "tokens_per_second_per_gpu": 17774.75, "total_tokens": 459067612 }, { "epoch": 0.2906351587896974, "grad_norm": 0.8601586222648621, "learning_rate": 2e-05, "loss": 0.6695, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4649, "tokens_per_second_per_gpu": 17745.59, "total_tokens": 459165723 }, { "epoch": 0.29069767441860467, "grad_norm": 0.8629136681556702, "learning_rate": 2e-05, "loss": 0.6716, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4650, "tokens_per_second_per_gpu": 18580.63, "total_tokens": 459270816 }, { "epoch": 0.29076019004751186, "grad_norm": 0.8829671740531921, "learning_rate": 2e-05, "loss": 0.6355, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4651, "tokens_per_second_per_gpu": 16493.0, "total_tokens": 459367115 }, { "epoch": 0.2908227056764191, "grad_norm": 0.8822529315948486, "learning_rate": 2e-05, "loss": 0.691, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4652, "tokens_per_second_per_gpu": 17857.33, "total_tokens": 459468469 }, { "epoch": 0.29088522130532635, "grad_norm": 0.8927502036094666, "learning_rate": 2e-05, "loss": 0.6462, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4653, "tokens_per_second_per_gpu": 17166.5, "total_tokens": 459566559 }, { "epoch": 0.29094773693423354, "grad_norm": 0.8590173721313477, "learning_rate": 2e-05, "loss": 0.6707, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4654, "tokens_per_second_per_gpu": 17752.56, "total_tokens": 459666655 }, { "epoch": 0.2910102525631408, "grad_norm": 0.8889918923377991, "learning_rate": 2e-05, "loss": 0.6857, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4655, "tokens_per_second_per_gpu": 18908.43, "total_tokens": 459767504 }, { "epoch": 0.291072768192048, "grad_norm": 0.8731060028076172, "learning_rate": 2e-05, "loss": 0.6397, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4656, "tokens_per_second_per_gpu": 15934.04, "total_tokens": 459862401 }, { "epoch": 0.2911352838209552, "grad_norm": 0.8787526488304138, "learning_rate": 2e-05, "loss": 0.6893, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4657, "tokens_per_second_per_gpu": 17697.06, "total_tokens": 459964730 }, { "epoch": 0.29119779944986246, "grad_norm": 0.8856022357940674, "learning_rate": 2e-05, "loss": 0.6767, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4658, "tokens_per_second_per_gpu": 17034.65, "total_tokens": 460064107 }, { "epoch": 0.2912603150787697, "grad_norm": 0.9394484162330627, "learning_rate": 2e-05, "loss": 0.6579, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4659, "tokens_per_second_per_gpu": 16564.46, "total_tokens": 460159939 }, { "epoch": 0.2913228307076769, "grad_norm": 0.8816092014312744, "learning_rate": 2e-05, "loss": 0.6934, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4660, "tokens_per_second_per_gpu": 18032.46, "total_tokens": 460260994 }, { "epoch": 0.29138534633658414, "grad_norm": 0.9293311834335327, "learning_rate": 2e-05, "loss": 0.6631, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4661, "tokens_per_second_per_gpu": 16478.9, "total_tokens": 460354647 }, { "epoch": 0.2914478619654914, "grad_norm": 0.9590787291526794, "learning_rate": 2e-05, "loss": 0.6711, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4662, "tokens_per_second_per_gpu": 18447.77, "total_tokens": 460453558 }, { "epoch": 0.2915103775943986, "grad_norm": 0.8909096717834473, "learning_rate": 2e-05, "loss": 0.6398, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4663, "tokens_per_second_per_gpu": 16056.4, "total_tokens": 460549199 }, { "epoch": 0.2915728932233058, "grad_norm": 0.8600170016288757, "learning_rate": 2e-05, "loss": 0.6691, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4664, "tokens_per_second_per_gpu": 17762.06, "total_tokens": 460651048 }, { "epoch": 0.29163540885221306, "grad_norm": 0.8735170364379883, "learning_rate": 2e-05, "loss": 0.6463, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4665, "tokens_per_second_per_gpu": 17218.07, "total_tokens": 460751044 }, { "epoch": 0.2916979244811203, "grad_norm": 0.9673137068748474, "learning_rate": 2e-05, "loss": 0.6576, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4666, "tokens_per_second_per_gpu": 16949.77, "total_tokens": 460845226 }, { "epoch": 0.2917604401100275, "grad_norm": 0.8658584356307983, "learning_rate": 2e-05, "loss": 0.6503, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4667, "tokens_per_second_per_gpu": 17911.12, "total_tokens": 460945707 }, { "epoch": 0.29182295573893474, "grad_norm": 0.8547970056533813, "learning_rate": 2e-05, "loss": 0.6215, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4668, "tokens_per_second_per_gpu": 18494.22, "total_tokens": 461043571 }, { "epoch": 0.291885471367842, "grad_norm": 0.8810645937919617, "learning_rate": 2e-05, "loss": 0.7032, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4669, "tokens_per_second_per_gpu": 18148.88, "total_tokens": 461147027 }, { "epoch": 0.2919479869967492, "grad_norm": 0.9024587869644165, "learning_rate": 2e-05, "loss": 0.6612, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4670, "tokens_per_second_per_gpu": 15192.78, "total_tokens": 461240453 }, { "epoch": 0.2920105026256564, "grad_norm": 0.8799788355827332, "learning_rate": 2e-05, "loss": 0.6586, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4671, "tokens_per_second_per_gpu": 17262.4, "total_tokens": 461336258 }, { "epoch": 0.29207301825456367, "grad_norm": 0.9096662998199463, "learning_rate": 2e-05, "loss": 0.6542, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4672, "tokens_per_second_per_gpu": 17244.64, "total_tokens": 461435366 }, { "epoch": 0.29213553388347085, "grad_norm": 0.8941748738288879, "learning_rate": 2e-05, "loss": 0.6559, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4673, "tokens_per_second_per_gpu": 16866.67, "total_tokens": 461533365 }, { "epoch": 0.2921980495123781, "grad_norm": 0.9907079935073853, "learning_rate": 2e-05, "loss": 0.6778, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4674, "tokens_per_second_per_gpu": 18829.04, "total_tokens": 461634484 }, { "epoch": 0.29226056514128534, "grad_norm": 0.8942365646362305, "learning_rate": 2e-05, "loss": 0.6524, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4675, "tokens_per_second_per_gpu": 17619.26, "total_tokens": 461732896 }, { "epoch": 0.29232308077019253, "grad_norm": 0.9900158047676086, "learning_rate": 2e-05, "loss": 0.6757, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4676, "tokens_per_second_per_gpu": 17758.73, "total_tokens": 461827375 }, { "epoch": 0.2923855963990998, "grad_norm": 0.917728066444397, "learning_rate": 2e-05, "loss": 0.699, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4677, "tokens_per_second_per_gpu": 17747.31, "total_tokens": 461927267 }, { "epoch": 0.292448112028007, "grad_norm": 0.9940172433853149, "learning_rate": 2e-05, "loss": 0.6987, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4678, "tokens_per_second_per_gpu": 17028.41, "total_tokens": 462021108 }, { "epoch": 0.2925106276569142, "grad_norm": 0.8689537048339844, "learning_rate": 2e-05, "loss": 0.6892, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4679, "tokens_per_second_per_gpu": 18276.17, "total_tokens": 462124393 }, { "epoch": 0.29257314328582146, "grad_norm": 0.9125019311904907, "learning_rate": 2e-05, "loss": 0.6651, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4680, "tokens_per_second_per_gpu": 17196.31, "total_tokens": 462224268 }, { "epoch": 0.2926356589147287, "grad_norm": 0.9100210666656494, "learning_rate": 2e-05, "loss": 0.6577, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4681, "tokens_per_second_per_gpu": 17655.02, "total_tokens": 462321389 }, { "epoch": 0.2926981745436359, "grad_norm": 0.9172157049179077, "learning_rate": 2e-05, "loss": 0.6812, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4682, "tokens_per_second_per_gpu": 17017.09, "total_tokens": 462420744 }, { "epoch": 0.29276069017254314, "grad_norm": 0.8772896528244019, "learning_rate": 2e-05, "loss": 0.6962, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4683, "tokens_per_second_per_gpu": 17689.41, "total_tokens": 462520158 }, { "epoch": 0.2928232058014504, "grad_norm": 0.8847780227661133, "learning_rate": 2e-05, "loss": 0.7215, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4684, "tokens_per_second_per_gpu": 18769.12, "total_tokens": 462627907 }, { "epoch": 0.29288572143035757, "grad_norm": 0.8690569996833801, "learning_rate": 2e-05, "loss": 0.6295, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4685, "tokens_per_second_per_gpu": 17617.03, "total_tokens": 462726117 }, { "epoch": 0.2929482370592648, "grad_norm": 0.8855158090591431, "learning_rate": 2e-05, "loss": 0.6705, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4686, "tokens_per_second_per_gpu": 17116.3, "total_tokens": 462823900 }, { "epoch": 0.29301075268817206, "grad_norm": 0.9747529625892639, "learning_rate": 2e-05, "loss": 0.6819, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4687, "tokens_per_second_per_gpu": 18475.15, "total_tokens": 462923154 }, { "epoch": 0.29307326831707925, "grad_norm": 0.9050012230873108, "learning_rate": 2e-05, "loss": 0.7012, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4688, "tokens_per_second_per_gpu": 18164.19, "total_tokens": 463026596 }, { "epoch": 0.2931357839459865, "grad_norm": 0.9572897553443909, "learning_rate": 2e-05, "loss": 0.7409, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4689, "tokens_per_second_per_gpu": 16748.95, "total_tokens": 463127506 }, { "epoch": 0.29319829957489374, "grad_norm": 0.9178160429000854, "learning_rate": 2e-05, "loss": 0.6526, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4690, "tokens_per_second_per_gpu": 17284.35, "total_tokens": 463225724 }, { "epoch": 0.29326081520380093, "grad_norm": 0.9258514642715454, "learning_rate": 2e-05, "loss": 0.6989, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4691, "tokens_per_second_per_gpu": 17972.92, "total_tokens": 463328050 }, { "epoch": 0.2933233308327082, "grad_norm": 0.8992276191711426, "learning_rate": 2e-05, "loss": 0.6858, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4692, "tokens_per_second_per_gpu": 17572.86, "total_tokens": 463427837 }, { "epoch": 0.2933858464616154, "grad_norm": 0.8816856145858765, "learning_rate": 2e-05, "loss": 0.6542, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4693, "tokens_per_second_per_gpu": 18000.68, "total_tokens": 463530115 }, { "epoch": 0.2934483620905226, "grad_norm": 0.8770127296447754, "learning_rate": 2e-05, "loss": 0.644, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4694, "tokens_per_second_per_gpu": 16988.37, "total_tokens": 463625046 }, { "epoch": 0.29351087771942985, "grad_norm": 0.8784949779510498, "learning_rate": 2e-05, "loss": 0.7175, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4695, "tokens_per_second_per_gpu": 17051.76, "total_tokens": 463725066 }, { "epoch": 0.2935733933483371, "grad_norm": 0.9513702392578125, "learning_rate": 2e-05, "loss": 0.6829, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4696, "tokens_per_second_per_gpu": 16939.04, "total_tokens": 463819387 }, { "epoch": 0.2936359089772443, "grad_norm": 0.860605001449585, "learning_rate": 2e-05, "loss": 0.6576, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4697, "tokens_per_second_per_gpu": 17050.51, "total_tokens": 463916722 }, { "epoch": 0.29369842460615153, "grad_norm": 0.9371704459190369, "learning_rate": 2e-05, "loss": 0.7251, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4698, "tokens_per_second_per_gpu": 15704.6, "total_tokens": 464008110 }, { "epoch": 0.2937609402350588, "grad_norm": 0.8711753487586975, "learning_rate": 2e-05, "loss": 0.6801, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4699, "tokens_per_second_per_gpu": 18455.68, "total_tokens": 464112042 }, { "epoch": 0.29382345586396597, "grad_norm": 0.9272322654724121, "learning_rate": 2e-05, "loss": 0.6783, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4700, "tokens_per_second_per_gpu": 17670.45, "total_tokens": 464214122 }, { "epoch": 0.2938859714928732, "grad_norm": 0.897452712059021, "learning_rate": 2e-05, "loss": 0.6881, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4701, "tokens_per_second_per_gpu": 17018.67, "total_tokens": 464315183 }, { "epoch": 0.29394848712178046, "grad_norm": 0.9077789783477783, "learning_rate": 2e-05, "loss": 0.6555, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4702, "tokens_per_second_per_gpu": 15794.93, "total_tokens": 464410796 }, { "epoch": 0.29401100275068764, "grad_norm": 6.744787216186523, "learning_rate": 2e-05, "loss": 0.7237, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4703, "tokens_per_second_per_gpu": 17341.1, "total_tokens": 464509759 }, { "epoch": 0.2940735183795949, "grad_norm": 0.8981118202209473, "learning_rate": 2e-05, "loss": 0.6254, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4704, "tokens_per_second_per_gpu": 17397.87, "total_tokens": 464605736 }, { "epoch": 0.29413603400850213, "grad_norm": 0.9153979420661926, "learning_rate": 2e-05, "loss": 0.6531, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4705, "tokens_per_second_per_gpu": 17296.41, "total_tokens": 464703872 }, { "epoch": 0.2941985496374094, "grad_norm": 0.8598496317863464, "learning_rate": 2e-05, "loss": 0.6347, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4706, "tokens_per_second_per_gpu": 18131.83, "total_tokens": 464804075 }, { "epoch": 0.29426106526631657, "grad_norm": 0.8699204325675964, "learning_rate": 2e-05, "loss": 0.6663, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4707, "tokens_per_second_per_gpu": 17952.17, "total_tokens": 464902407 }, { "epoch": 0.2943235808952238, "grad_norm": 0.8764806985855103, "learning_rate": 2e-05, "loss": 0.718, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4708, "tokens_per_second_per_gpu": 18472.36, "total_tokens": 465005778 }, { "epoch": 0.29438609652413106, "grad_norm": 0.8986261487007141, "learning_rate": 2e-05, "loss": 0.6731, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4709, "tokens_per_second_per_gpu": 17291.85, "total_tokens": 465104884 }, { "epoch": 0.29444861215303825, "grad_norm": 0.9174516797065735, "learning_rate": 2e-05, "loss": 0.7011, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4710, "tokens_per_second_per_gpu": 17058.71, "total_tokens": 465201682 }, { "epoch": 0.2945111277819455, "grad_norm": 0.9165505170822144, "learning_rate": 2e-05, "loss": 0.6804, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4711, "tokens_per_second_per_gpu": 17776.2, "total_tokens": 465300138 }, { "epoch": 0.29457364341085274, "grad_norm": 0.8867061138153076, "learning_rate": 2e-05, "loss": 0.6982, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4712, "tokens_per_second_per_gpu": 17921.0, "total_tokens": 465401492 }, { "epoch": 0.2946361590397599, "grad_norm": 0.95136559009552, "learning_rate": 2e-05, "loss": 0.6817, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4713, "tokens_per_second_per_gpu": 17925.75, "total_tokens": 465497985 }, { "epoch": 0.29469867466866717, "grad_norm": 0.9087960720062256, "learning_rate": 2e-05, "loss": 0.6629, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4714, "tokens_per_second_per_gpu": 17434.39, "total_tokens": 465596143 }, { "epoch": 0.2947611902975744, "grad_norm": 0.8862255215644836, "learning_rate": 2e-05, "loss": 0.6645, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4715, "tokens_per_second_per_gpu": 17118.16, "total_tokens": 465696317 }, { "epoch": 0.2948237059264816, "grad_norm": 0.9131066203117371, "learning_rate": 2e-05, "loss": 0.6882, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4716, "tokens_per_second_per_gpu": 18266.3, "total_tokens": 465795557 }, { "epoch": 0.29488622155538885, "grad_norm": 0.9294102787971497, "learning_rate": 2e-05, "loss": 0.6965, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4717, "tokens_per_second_per_gpu": 15252.04, "total_tokens": 465887994 }, { "epoch": 0.2949487371842961, "grad_norm": 0.9125077724456787, "learning_rate": 2e-05, "loss": 0.6948, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4718, "tokens_per_second_per_gpu": 17124.69, "total_tokens": 465985486 }, { "epoch": 0.2950112528132033, "grad_norm": 0.888096034526825, "learning_rate": 2e-05, "loss": 0.6521, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4719, "tokens_per_second_per_gpu": 18080.14, "total_tokens": 466088307 }, { "epoch": 0.29507376844211053, "grad_norm": 0.9345909357070923, "learning_rate": 2e-05, "loss": 0.6716, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4720, "tokens_per_second_per_gpu": 17332.45, "total_tokens": 466186531 }, { "epoch": 0.2951362840710178, "grad_norm": 0.9005416035652161, "learning_rate": 2e-05, "loss": 0.6811, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4721, "tokens_per_second_per_gpu": 17376.01, "total_tokens": 466286065 }, { "epoch": 0.29519879969992496, "grad_norm": 0.8866885304450989, "learning_rate": 2e-05, "loss": 0.6792, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4722, "tokens_per_second_per_gpu": 17877.35, "total_tokens": 466384024 }, { "epoch": 0.2952613153288322, "grad_norm": 0.9168701767921448, "learning_rate": 2e-05, "loss": 0.6481, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4723, "tokens_per_second_per_gpu": 17283.06, "total_tokens": 466479911 }, { "epoch": 0.29532383095773945, "grad_norm": 0.9196596145629883, "learning_rate": 2e-05, "loss": 0.6347, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4724, "tokens_per_second_per_gpu": 15685.94, "total_tokens": 466574404 }, { "epoch": 0.29538634658664664, "grad_norm": 0.9334442019462585, "learning_rate": 2e-05, "loss": 0.7001, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4725, "tokens_per_second_per_gpu": 18104.22, "total_tokens": 466676121 }, { "epoch": 0.2954488622155539, "grad_norm": 0.9244635105133057, "learning_rate": 2e-05, "loss": 0.7473, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4726, "tokens_per_second_per_gpu": 16312.27, "total_tokens": 466774915 }, { "epoch": 0.29551137784446113, "grad_norm": 0.9023314714431763, "learning_rate": 2e-05, "loss": 0.6454, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4727, "tokens_per_second_per_gpu": 17328.03, "total_tokens": 466876023 }, { "epoch": 0.2955738934733683, "grad_norm": 0.8996488451957703, "learning_rate": 2e-05, "loss": 0.6652, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4728, "tokens_per_second_per_gpu": 18193.39, "total_tokens": 466974282 }, { "epoch": 0.29563640910227557, "grad_norm": 0.8879632949829102, "learning_rate": 2e-05, "loss": 0.6996, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4729, "tokens_per_second_per_gpu": 17545.98, "total_tokens": 467074965 }, { "epoch": 0.2956989247311828, "grad_norm": 0.9348223209381104, "learning_rate": 2e-05, "loss": 0.7064, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4730, "tokens_per_second_per_gpu": 17980.32, "total_tokens": 467177267 }, { "epoch": 0.29576144036009, "grad_norm": 0.9176515936851501, "learning_rate": 2e-05, "loss": 0.6932, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4731, "tokens_per_second_per_gpu": 18207.46, "total_tokens": 467278690 }, { "epoch": 0.29582395598899724, "grad_norm": 0.882964551448822, "learning_rate": 2e-05, "loss": 0.6606, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4732, "tokens_per_second_per_gpu": 17956.62, "total_tokens": 467379248 }, { "epoch": 0.2958864716179045, "grad_norm": 0.8581663370132446, "learning_rate": 2e-05, "loss": 0.6456, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4733, "tokens_per_second_per_gpu": 17788.93, "total_tokens": 467478790 }, { "epoch": 0.2959489872468117, "grad_norm": 0.8736052513122559, "learning_rate": 2e-05, "loss": 0.6988, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4734, "tokens_per_second_per_gpu": 17818.29, "total_tokens": 467582191 }, { "epoch": 0.2960115028757189, "grad_norm": 0.9485660791397095, "learning_rate": 2e-05, "loss": 0.6607, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4735, "tokens_per_second_per_gpu": 17564.15, "total_tokens": 467682694 }, { "epoch": 0.29607401850462617, "grad_norm": 0.9141643047332764, "learning_rate": 2e-05, "loss": 0.6839, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4736, "tokens_per_second_per_gpu": 16039.7, "total_tokens": 467782074 }, { "epoch": 0.29613653413353336, "grad_norm": 0.9237189888954163, "learning_rate": 2e-05, "loss": 0.6575, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4737, "tokens_per_second_per_gpu": 16737.24, "total_tokens": 467879044 }, { "epoch": 0.2961990497624406, "grad_norm": 0.9025202393531799, "learning_rate": 2e-05, "loss": 0.672, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4738, "tokens_per_second_per_gpu": 16559.84, "total_tokens": 467972216 }, { "epoch": 0.29626156539134785, "grad_norm": 0.9007053375244141, "learning_rate": 2e-05, "loss": 0.6755, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4739, "tokens_per_second_per_gpu": 17025.43, "total_tokens": 468069174 }, { "epoch": 0.29632408102025504, "grad_norm": 0.9053657650947571, "learning_rate": 2e-05, "loss": 0.6239, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4740, "tokens_per_second_per_gpu": 16792.45, "total_tokens": 468163183 }, { "epoch": 0.2963865966491623, "grad_norm": 0.9359418153762817, "learning_rate": 2e-05, "loss": 0.6776, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4741, "tokens_per_second_per_gpu": 17674.28, "total_tokens": 468260928 }, { "epoch": 0.2964491122780695, "grad_norm": 0.961052417755127, "learning_rate": 2e-05, "loss": 0.7034, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4742, "tokens_per_second_per_gpu": 16869.57, "total_tokens": 468355792 }, { "epoch": 0.29651162790697677, "grad_norm": 0.8894666433334351, "learning_rate": 2e-05, "loss": 0.6445, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4743, "tokens_per_second_per_gpu": 17568.69, "total_tokens": 468454254 }, { "epoch": 0.29657414353588396, "grad_norm": 0.8725171089172363, "learning_rate": 2e-05, "loss": 0.6853, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4744, "tokens_per_second_per_gpu": 17744.88, "total_tokens": 468556486 }, { "epoch": 0.2966366591647912, "grad_norm": 0.8966874480247498, "learning_rate": 2e-05, "loss": 0.6464, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4745, "tokens_per_second_per_gpu": 17428.98, "total_tokens": 468655469 }, { "epoch": 0.29669917479369845, "grad_norm": 0.9535569548606873, "learning_rate": 2e-05, "loss": 0.6529, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4746, "tokens_per_second_per_gpu": 17262.69, "total_tokens": 468753325 }, { "epoch": 0.29676169042260564, "grad_norm": 0.8786736130714417, "learning_rate": 2e-05, "loss": 0.6602, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4747, "tokens_per_second_per_gpu": 17904.18, "total_tokens": 468854104 }, { "epoch": 0.2968242060515129, "grad_norm": 0.9045248031616211, "learning_rate": 2e-05, "loss": 0.7039, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4748, "tokens_per_second_per_gpu": 17162.98, "total_tokens": 468951772 }, { "epoch": 0.29688672168042013, "grad_norm": 0.9413173794746399, "learning_rate": 2e-05, "loss": 0.7064, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4749, "tokens_per_second_per_gpu": 18561.67, "total_tokens": 469055294 }, { "epoch": 0.2969492373093273, "grad_norm": 0.9048007726669312, "learning_rate": 2e-05, "loss": 0.7086, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4750, "tokens_per_second_per_gpu": 17833.74, "total_tokens": 469154099 }, { "epoch": 0.29701175293823456, "grad_norm": 0.9203731417655945, "learning_rate": 2e-05, "loss": 0.6662, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4751, "tokens_per_second_per_gpu": 17880.64, "total_tokens": 469254531 }, { "epoch": 0.2970742685671418, "grad_norm": 0.9261984825134277, "learning_rate": 2e-05, "loss": 0.6471, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4752, "tokens_per_second_per_gpu": 17372.07, "total_tokens": 469350954 }, { "epoch": 0.297136784196049, "grad_norm": 0.9106677770614624, "learning_rate": 2e-05, "loss": 0.665, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4753, "tokens_per_second_per_gpu": 17947.94, "total_tokens": 469452197 }, { "epoch": 0.29719929982495624, "grad_norm": 0.8938488364219666, "learning_rate": 2e-05, "loss": 0.6536, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4754, "tokens_per_second_per_gpu": 17343.12, "total_tokens": 469552103 }, { "epoch": 0.2972618154538635, "grad_norm": 0.904071033000946, "learning_rate": 2e-05, "loss": 0.6631, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4755, "tokens_per_second_per_gpu": 17551.81, "total_tokens": 469651548 }, { "epoch": 0.2973243310827707, "grad_norm": 0.9475479125976562, "learning_rate": 2e-05, "loss": 0.7096, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4756, "tokens_per_second_per_gpu": 16738.28, "total_tokens": 469746406 }, { "epoch": 0.2973868467116779, "grad_norm": 0.8509210348129272, "learning_rate": 2e-05, "loss": 0.6304, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4757, "tokens_per_second_per_gpu": 17017.64, "total_tokens": 469847164 }, { "epoch": 0.29744936234058517, "grad_norm": 0.8508827090263367, "learning_rate": 2e-05, "loss": 0.64, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4758, "tokens_per_second_per_gpu": 17011.94, "total_tokens": 469946768 }, { "epoch": 0.29751187796949236, "grad_norm": 0.9180218577384949, "learning_rate": 2e-05, "loss": 0.7505, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4759, "tokens_per_second_per_gpu": 16589.11, "total_tokens": 470043738 }, { "epoch": 0.2975743935983996, "grad_norm": 0.9312283396720886, "learning_rate": 2e-05, "loss": 0.7244, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4760, "tokens_per_second_per_gpu": 17711.48, "total_tokens": 470139079 }, { "epoch": 0.29763690922730685, "grad_norm": 1.0270681381225586, "learning_rate": 2e-05, "loss": 0.6552, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4761, "tokens_per_second_per_gpu": 16550.11, "total_tokens": 470231778 }, { "epoch": 0.29769942485621403, "grad_norm": 0.8922838568687439, "learning_rate": 2e-05, "loss": 0.6619, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4762, "tokens_per_second_per_gpu": 15602.81, "total_tokens": 470324593 }, { "epoch": 0.2977619404851213, "grad_norm": 0.952025830745697, "learning_rate": 2e-05, "loss": 0.6691, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4763, "tokens_per_second_per_gpu": 16329.61, "total_tokens": 470420336 }, { "epoch": 0.2978244561140285, "grad_norm": 0.8594491481781006, "learning_rate": 2e-05, "loss": 0.6569, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4764, "tokens_per_second_per_gpu": 18795.5, "total_tokens": 470523502 }, { "epoch": 0.2978869717429357, "grad_norm": 0.9437939524650574, "learning_rate": 2e-05, "loss": 0.646, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4765, "tokens_per_second_per_gpu": 17093.31, "total_tokens": 470619844 }, { "epoch": 0.29794948737184296, "grad_norm": 0.8783046007156372, "learning_rate": 2e-05, "loss": 0.6807, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4766, "tokens_per_second_per_gpu": 18306.49, "total_tokens": 470719820 }, { "epoch": 0.2980120030007502, "grad_norm": 0.8958513140678406, "learning_rate": 2e-05, "loss": 0.6881, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4767, "tokens_per_second_per_gpu": 17970.13, "total_tokens": 470822717 }, { "epoch": 0.2980745186296574, "grad_norm": 0.8972983956336975, "learning_rate": 2e-05, "loss": 0.6641, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4768, "tokens_per_second_per_gpu": 17113.23, "total_tokens": 470914863 }, { "epoch": 0.29813703425856464, "grad_norm": 0.8987625241279602, "learning_rate": 2e-05, "loss": 0.6692, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4769, "tokens_per_second_per_gpu": 16743.44, "total_tokens": 471011917 }, { "epoch": 0.2981995498874719, "grad_norm": 0.9187043905258179, "learning_rate": 2e-05, "loss": 0.6239, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4770, "tokens_per_second_per_gpu": 17929.48, "total_tokens": 471111079 }, { "epoch": 0.29826206551637907, "grad_norm": 0.8856166005134583, "learning_rate": 2e-05, "loss": 0.6525, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4771, "tokens_per_second_per_gpu": 18099.72, "total_tokens": 471212705 }, { "epoch": 0.2983245811452863, "grad_norm": 0.8630372285842896, "learning_rate": 2e-05, "loss": 0.6446, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4772, "tokens_per_second_per_gpu": 17114.01, "total_tokens": 471313547 }, { "epoch": 0.29838709677419356, "grad_norm": 0.9110393524169922, "learning_rate": 2e-05, "loss": 0.6436, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4773, "tokens_per_second_per_gpu": 17259.43, "total_tokens": 471409968 }, { "epoch": 0.29844961240310075, "grad_norm": 0.9228318333625793, "learning_rate": 2e-05, "loss": 0.7452, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4774, "tokens_per_second_per_gpu": 17231.34, "total_tokens": 471508997 }, { "epoch": 0.298512128032008, "grad_norm": 0.9180659651756287, "learning_rate": 2e-05, "loss": 0.7239, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4775, "tokens_per_second_per_gpu": 16818.67, "total_tokens": 471607244 }, { "epoch": 0.29857464366091524, "grad_norm": 0.9122129082679749, "learning_rate": 2e-05, "loss": 0.6823, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4776, "tokens_per_second_per_gpu": 16558.31, "total_tokens": 471703599 }, { "epoch": 0.29863715928982243, "grad_norm": 0.8737307786941528, "learning_rate": 2e-05, "loss": 0.6475, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4777, "tokens_per_second_per_gpu": 17663.22, "total_tokens": 471800543 }, { "epoch": 0.2986996749187297, "grad_norm": 0.9099553227424622, "learning_rate": 2e-05, "loss": 0.6315, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4778, "tokens_per_second_per_gpu": 16430.96, "total_tokens": 471896671 }, { "epoch": 0.2987621905476369, "grad_norm": 0.903337299823761, "learning_rate": 2e-05, "loss": 0.6698, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4779, "tokens_per_second_per_gpu": 17136.36, "total_tokens": 471994914 }, { "epoch": 0.29882470617654416, "grad_norm": 0.8966655135154724, "learning_rate": 2e-05, "loss": 0.6563, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4780, "tokens_per_second_per_gpu": 16803.58, "total_tokens": 472094744 }, { "epoch": 0.29888722180545135, "grad_norm": 0.927264392375946, "learning_rate": 2e-05, "loss": 0.6783, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4781, "tokens_per_second_per_gpu": 17791.94, "total_tokens": 472192467 }, { "epoch": 0.2989497374343586, "grad_norm": 0.871468722820282, "learning_rate": 2e-05, "loss": 0.7075, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4782, "tokens_per_second_per_gpu": 18532.21, "total_tokens": 472293484 }, { "epoch": 0.29901225306326584, "grad_norm": 0.892730176448822, "learning_rate": 2e-05, "loss": 0.654, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4783, "tokens_per_second_per_gpu": 17299.06, "total_tokens": 472389530 }, { "epoch": 0.29907476869217303, "grad_norm": 0.8902091383934021, "learning_rate": 2e-05, "loss": 0.6523, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4784, "tokens_per_second_per_gpu": 17455.41, "total_tokens": 472486874 }, { "epoch": 0.2991372843210803, "grad_norm": 0.8885595798492432, "learning_rate": 2e-05, "loss": 0.6518, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4785, "tokens_per_second_per_gpu": 16805.33, "total_tokens": 472585810 }, { "epoch": 0.2991997999499875, "grad_norm": 0.9195186495780945, "learning_rate": 2e-05, "loss": 0.7042, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4786, "tokens_per_second_per_gpu": 17202.38, "total_tokens": 472686073 }, { "epoch": 0.2992623155788947, "grad_norm": 0.9591551423072815, "learning_rate": 2e-05, "loss": 0.6927, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4787, "tokens_per_second_per_gpu": 17199.26, "total_tokens": 472782941 }, { "epoch": 0.29932483120780196, "grad_norm": 0.9320298433303833, "learning_rate": 2e-05, "loss": 0.6662, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4788, "tokens_per_second_per_gpu": 17005.64, "total_tokens": 472882172 }, { "epoch": 0.2993873468367092, "grad_norm": 0.9079867005348206, "learning_rate": 2e-05, "loss": 0.6772, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4789, "tokens_per_second_per_gpu": 16743.57, "total_tokens": 472980096 }, { "epoch": 0.2994498624656164, "grad_norm": 0.9163424968719482, "learning_rate": 2e-05, "loss": 0.6633, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4790, "tokens_per_second_per_gpu": 15415.37, "total_tokens": 473070937 }, { "epoch": 0.29951237809452363, "grad_norm": 0.8930070400238037, "learning_rate": 2e-05, "loss": 0.6649, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4791, "tokens_per_second_per_gpu": 16572.54, "total_tokens": 473165291 }, { "epoch": 0.2995748937234309, "grad_norm": 0.8765770196914673, "learning_rate": 2e-05, "loss": 0.6705, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4792, "tokens_per_second_per_gpu": 17028.45, "total_tokens": 473265022 }, { "epoch": 0.29963740935233807, "grad_norm": 0.9302031993865967, "learning_rate": 2e-05, "loss": 0.722, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4793, "tokens_per_second_per_gpu": 18137.55, "total_tokens": 473364335 }, { "epoch": 0.2996999249812453, "grad_norm": 0.9068483114242554, "learning_rate": 2e-05, "loss": 0.6894, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4794, "tokens_per_second_per_gpu": 17076.28, "total_tokens": 473461923 }, { "epoch": 0.29976244061015256, "grad_norm": 0.9411693811416626, "learning_rate": 2e-05, "loss": 0.7372, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4795, "tokens_per_second_per_gpu": 16310.45, "total_tokens": 473560962 }, { "epoch": 0.29982495623905975, "grad_norm": 0.9112347364425659, "learning_rate": 2e-05, "loss": 0.6811, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4796, "tokens_per_second_per_gpu": 17585.51, "total_tokens": 473659647 }, { "epoch": 0.299887471867967, "grad_norm": 0.8945412039756775, "learning_rate": 2e-05, "loss": 0.6709, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4797, "tokens_per_second_per_gpu": 17799.02, "total_tokens": 473761165 }, { "epoch": 0.29994998749687424, "grad_norm": 0.8708286881446838, "learning_rate": 2e-05, "loss": 0.6595, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4798, "tokens_per_second_per_gpu": 17195.96, "total_tokens": 473858636 }, { "epoch": 0.3000125031257814, "grad_norm": 0.9001416563987732, "learning_rate": 2e-05, "loss": 0.6863, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4799, "tokens_per_second_per_gpu": 16865.71, "total_tokens": 473955753 }, { "epoch": 0.30007501875468867, "grad_norm": 0.875099241733551, "learning_rate": 2e-05, "loss": 0.6678, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4800, "tokens_per_second_per_gpu": 17016.86, "total_tokens": 474057230 }, { "epoch": 0.3001375343835959, "grad_norm": 0.8720068335533142, "learning_rate": 2e-05, "loss": 0.6147, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4801, "tokens_per_second_per_gpu": 15697.66, "total_tokens": 474147875 }, { "epoch": 0.3002000500125031, "grad_norm": 0.8968935012817383, "learning_rate": 2e-05, "loss": 0.7346, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4802, "tokens_per_second_per_gpu": 18458.26, "total_tokens": 474254606 }, { "epoch": 0.30026256564141035, "grad_norm": 0.8631771802902222, "learning_rate": 2e-05, "loss": 0.6456, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4803, "tokens_per_second_per_gpu": 17148.25, "total_tokens": 474352812 }, { "epoch": 0.3003250812703176, "grad_norm": 0.8914941549301147, "learning_rate": 2e-05, "loss": 0.6764, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4804, "tokens_per_second_per_gpu": 16853.29, "total_tokens": 474450313 }, { "epoch": 0.3003875968992248, "grad_norm": 0.9096437692642212, "learning_rate": 2e-05, "loss": 0.6892, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4805, "tokens_per_second_per_gpu": 16261.29, "total_tokens": 474547107 }, { "epoch": 0.30045011252813203, "grad_norm": 0.9390829801559448, "learning_rate": 2e-05, "loss": 0.6746, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4806, "tokens_per_second_per_gpu": 16000.16, "total_tokens": 474641553 }, { "epoch": 0.3005126281570393, "grad_norm": 0.8623851537704468, "learning_rate": 2e-05, "loss": 0.6346, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4807, "tokens_per_second_per_gpu": 17508.73, "total_tokens": 474739856 }, { "epoch": 0.30057514378594646, "grad_norm": 0.8952809572219849, "learning_rate": 2e-05, "loss": 0.6864, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4808, "tokens_per_second_per_gpu": 17989.69, "total_tokens": 474843263 }, { "epoch": 0.3006376594148537, "grad_norm": 0.9129838347434998, "learning_rate": 2e-05, "loss": 0.6563, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4809, "tokens_per_second_per_gpu": 16732.73, "total_tokens": 474938996 }, { "epoch": 0.30070017504376095, "grad_norm": 0.9182596206665039, "learning_rate": 2e-05, "loss": 0.6696, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4810, "tokens_per_second_per_gpu": 16909.57, "total_tokens": 475036439 }, { "epoch": 0.30076269067266814, "grad_norm": 0.9239391684532166, "learning_rate": 2e-05, "loss": 0.6987, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4811, "tokens_per_second_per_gpu": 17803.11, "total_tokens": 475139661 }, { "epoch": 0.3008252063015754, "grad_norm": 0.8592514991760254, "learning_rate": 2e-05, "loss": 0.6546, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4812, "tokens_per_second_per_gpu": 16853.14, "total_tokens": 475241576 }, { "epoch": 0.30088772193048263, "grad_norm": 0.9000786542892456, "learning_rate": 2e-05, "loss": 0.6639, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4813, "tokens_per_second_per_gpu": 18365.54, "total_tokens": 475341098 }, { "epoch": 0.3009502375593898, "grad_norm": 0.8964524269104004, "learning_rate": 2e-05, "loss": 0.6821, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4814, "tokens_per_second_per_gpu": 17391.33, "total_tokens": 475442526 }, { "epoch": 0.30101275318829707, "grad_norm": 0.891237735748291, "learning_rate": 2e-05, "loss": 0.7164, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4815, "tokens_per_second_per_gpu": 18302.31, "total_tokens": 475544210 }, { "epoch": 0.3010752688172043, "grad_norm": 0.9440631866455078, "learning_rate": 2e-05, "loss": 0.6566, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4816, "tokens_per_second_per_gpu": 15919.22, "total_tokens": 475637683 }, { "epoch": 0.3011377844461115, "grad_norm": 0.9034432172775269, "learning_rate": 2e-05, "loss": 0.7204, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4817, "tokens_per_second_per_gpu": 18329.06, "total_tokens": 475738350 }, { "epoch": 0.30120030007501875, "grad_norm": 0.8830145597457886, "learning_rate": 2e-05, "loss": 0.6764, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4818, "tokens_per_second_per_gpu": 17842.28, "total_tokens": 475839350 }, { "epoch": 0.301262815703926, "grad_norm": 0.8804383277893066, "learning_rate": 2e-05, "loss": 0.7207, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4819, "tokens_per_second_per_gpu": 18014.62, "total_tokens": 475940468 }, { "epoch": 0.30132533133283324, "grad_norm": 0.9027367830276489, "learning_rate": 2e-05, "loss": 0.7004, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4820, "tokens_per_second_per_gpu": 17783.32, "total_tokens": 476040088 }, { "epoch": 0.3013878469617404, "grad_norm": 0.9209609627723694, "learning_rate": 2e-05, "loss": 0.702, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4821, "tokens_per_second_per_gpu": 18537.8, "total_tokens": 476143096 }, { "epoch": 0.30145036259064767, "grad_norm": 0.8888763785362244, "learning_rate": 2e-05, "loss": 0.7139, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4822, "tokens_per_second_per_gpu": 18453.35, "total_tokens": 476247710 }, { "epoch": 0.3015128782195549, "grad_norm": 0.9201468229293823, "learning_rate": 2e-05, "loss": 0.7296, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4823, "tokens_per_second_per_gpu": 18230.85, "total_tokens": 476351081 }, { "epoch": 0.3015753938484621, "grad_norm": 0.8452886343002319, "learning_rate": 2e-05, "loss": 0.6416, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4824, "tokens_per_second_per_gpu": 17839.53, "total_tokens": 476452066 }, { "epoch": 0.30163790947736935, "grad_norm": 0.9174933433532715, "learning_rate": 2e-05, "loss": 0.7222, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4825, "tokens_per_second_per_gpu": 17128.24, "total_tokens": 476550785 }, { "epoch": 0.3017004251062766, "grad_norm": 0.8731552958488464, "learning_rate": 2e-05, "loss": 0.6637, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4826, "tokens_per_second_per_gpu": 17602.12, "total_tokens": 476649206 }, { "epoch": 0.3017629407351838, "grad_norm": 0.8990882635116577, "learning_rate": 2e-05, "loss": 0.6729, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4827, "tokens_per_second_per_gpu": 16545.57, "total_tokens": 476747254 }, { "epoch": 0.301825456364091, "grad_norm": 0.9346776008605957, "learning_rate": 2e-05, "loss": 0.6798, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4828, "tokens_per_second_per_gpu": 16926.74, "total_tokens": 476843679 }, { "epoch": 0.30188797199299827, "grad_norm": 0.8574193120002747, "learning_rate": 2e-05, "loss": 0.6438, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4829, "tokens_per_second_per_gpu": 17371.64, "total_tokens": 476944753 }, { "epoch": 0.30195048762190546, "grad_norm": 0.8806803822517395, "learning_rate": 2e-05, "loss": 0.7087, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4830, "tokens_per_second_per_gpu": 18517.36, "total_tokens": 477047540 }, { "epoch": 0.3020130032508127, "grad_norm": 0.9133883714675903, "learning_rate": 2e-05, "loss": 0.6527, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4831, "tokens_per_second_per_gpu": 16204.6, "total_tokens": 477145782 }, { "epoch": 0.30207551887971995, "grad_norm": 0.9005604386329651, "learning_rate": 2e-05, "loss": 0.6582, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4832, "tokens_per_second_per_gpu": 17406.32, "total_tokens": 477243925 }, { "epoch": 0.30213803450862714, "grad_norm": 0.8757224678993225, "learning_rate": 2e-05, "loss": 0.6612, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4833, "tokens_per_second_per_gpu": 18021.82, "total_tokens": 477344911 }, { "epoch": 0.3022005501375344, "grad_norm": 0.902496337890625, "learning_rate": 2e-05, "loss": 0.6485, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4834, "tokens_per_second_per_gpu": 15882.02, "total_tokens": 477437159 }, { "epoch": 0.30226306576644163, "grad_norm": 0.9221833348274231, "learning_rate": 2e-05, "loss": 0.6815, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4835, "tokens_per_second_per_gpu": 15677.39, "total_tokens": 477528570 }, { "epoch": 0.3023255813953488, "grad_norm": 0.8962406516075134, "learning_rate": 2e-05, "loss": 0.6473, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4836, "tokens_per_second_per_gpu": 16635.74, "total_tokens": 477623735 }, { "epoch": 0.30238809702425606, "grad_norm": 0.8756430745124817, "learning_rate": 2e-05, "loss": 0.6516, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4837, "tokens_per_second_per_gpu": 17728.6, "total_tokens": 477725278 }, { "epoch": 0.3024506126531633, "grad_norm": 0.8770384788513184, "learning_rate": 2e-05, "loss": 0.6974, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4838, "tokens_per_second_per_gpu": 16445.96, "total_tokens": 477825556 }, { "epoch": 0.3025131282820705, "grad_norm": 0.8758072257041931, "learning_rate": 2e-05, "loss": 0.6991, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4839, "tokens_per_second_per_gpu": 17955.3, "total_tokens": 477926813 }, { "epoch": 0.30257564391097774, "grad_norm": 0.8817964196205139, "learning_rate": 2e-05, "loss": 0.6797, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4840, "tokens_per_second_per_gpu": 17573.9, "total_tokens": 478028646 }, { "epoch": 0.302638159539885, "grad_norm": 0.8675298094749451, "learning_rate": 2e-05, "loss": 0.6647, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4841, "tokens_per_second_per_gpu": 17073.45, "total_tokens": 478130461 }, { "epoch": 0.3027006751687922, "grad_norm": 0.9089756608009338, "learning_rate": 2e-05, "loss": 0.6994, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4842, "tokens_per_second_per_gpu": 17269.82, "total_tokens": 478226191 }, { "epoch": 0.3027631907976994, "grad_norm": 0.8854541182518005, "learning_rate": 2e-05, "loss": 0.6746, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4843, "tokens_per_second_per_gpu": 18105.58, "total_tokens": 478325392 }, { "epoch": 0.30282570642660667, "grad_norm": 0.9125556349754333, "learning_rate": 2e-05, "loss": 0.7124, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4844, "tokens_per_second_per_gpu": 18386.69, "total_tokens": 478427373 }, { "epoch": 0.30288822205551386, "grad_norm": 0.885036289691925, "learning_rate": 2e-05, "loss": 0.6791, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4845, "tokens_per_second_per_gpu": 17786.99, "total_tokens": 478526208 }, { "epoch": 0.3029507376844211, "grad_norm": 0.8894561529159546, "learning_rate": 2e-05, "loss": 0.6948, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4846, "tokens_per_second_per_gpu": 17881.3, "total_tokens": 478628753 }, { "epoch": 0.30301325331332835, "grad_norm": 0.9117621183395386, "learning_rate": 2e-05, "loss": 0.6365, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4847, "tokens_per_second_per_gpu": 17692.39, "total_tokens": 478727522 }, { "epoch": 0.30307576894223553, "grad_norm": 0.8600981831550598, "learning_rate": 2e-05, "loss": 0.6384, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4848, "tokens_per_second_per_gpu": 17440.7, "total_tokens": 478829739 }, { "epoch": 0.3031382845711428, "grad_norm": 0.9199194312095642, "learning_rate": 2e-05, "loss": 0.6565, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4849, "tokens_per_second_per_gpu": 17384.85, "total_tokens": 478931457 }, { "epoch": 0.30320080020005, "grad_norm": 0.8880859017372131, "learning_rate": 2e-05, "loss": 0.6818, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4850, "tokens_per_second_per_gpu": 16970.68, "total_tokens": 479029849 }, { "epoch": 0.3032633158289572, "grad_norm": 0.8885531425476074, "learning_rate": 2e-05, "loss": 0.6577, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4851, "tokens_per_second_per_gpu": 16873.04, "total_tokens": 479129121 }, { "epoch": 0.30332583145786446, "grad_norm": 0.9004863500595093, "learning_rate": 2e-05, "loss": 0.6957, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4852, "tokens_per_second_per_gpu": 16331.06, "total_tokens": 479228919 }, { "epoch": 0.3033883470867717, "grad_norm": 0.9250763654708862, "learning_rate": 2e-05, "loss": 0.6323, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4853, "tokens_per_second_per_gpu": 16816.93, "total_tokens": 479323599 }, { "epoch": 0.3034508627156789, "grad_norm": 0.9278706312179565, "learning_rate": 2e-05, "loss": 0.692, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4854, "tokens_per_second_per_gpu": 17374.21, "total_tokens": 479423413 }, { "epoch": 0.30351337834458614, "grad_norm": 0.9564217925071716, "learning_rate": 2e-05, "loss": 0.6453, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4855, "tokens_per_second_per_gpu": 17927.81, "total_tokens": 479520497 }, { "epoch": 0.3035758939734934, "grad_norm": 0.9377334117889404, "learning_rate": 2e-05, "loss": 0.7254, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4856, "tokens_per_second_per_gpu": 17910.65, "total_tokens": 479618746 }, { "epoch": 0.3036384096024006, "grad_norm": 0.8876716494560242, "learning_rate": 2e-05, "loss": 0.6798, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4857, "tokens_per_second_per_gpu": 17392.55, "total_tokens": 479719893 }, { "epoch": 0.3037009252313078, "grad_norm": 0.9912153482437134, "learning_rate": 2e-05, "loss": 0.6261, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4858, "tokens_per_second_per_gpu": 17444.1, "total_tokens": 479819339 }, { "epoch": 0.30376344086021506, "grad_norm": 0.9584583044052124, "learning_rate": 2e-05, "loss": 0.6967, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4859, "tokens_per_second_per_gpu": 16856.18, "total_tokens": 479918769 }, { "epoch": 0.3038259564891223, "grad_norm": 0.9722508192062378, "learning_rate": 2e-05, "loss": 0.6673, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4860, "tokens_per_second_per_gpu": 16873.34, "total_tokens": 480017399 }, { "epoch": 0.3038884721180295, "grad_norm": 0.9230608940124512, "learning_rate": 2e-05, "loss": 0.6986, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4861, "tokens_per_second_per_gpu": 17284.09, "total_tokens": 480119389 }, { "epoch": 0.30395098774693674, "grad_norm": 0.9330025911331177, "learning_rate": 2e-05, "loss": 0.6816, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4862, "tokens_per_second_per_gpu": 17089.05, "total_tokens": 480218685 }, { "epoch": 0.304013503375844, "grad_norm": 0.904222846031189, "learning_rate": 2e-05, "loss": 0.7061, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4863, "tokens_per_second_per_gpu": 18407.77, "total_tokens": 480319872 }, { "epoch": 0.3040760190047512, "grad_norm": 0.9265587329864502, "learning_rate": 2e-05, "loss": 0.675, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4864, "tokens_per_second_per_gpu": 17341.67, "total_tokens": 480418772 }, { "epoch": 0.3041385346336584, "grad_norm": 0.9210604429244995, "learning_rate": 2e-05, "loss": 0.6716, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4865, "tokens_per_second_per_gpu": 17221.0, "total_tokens": 480517797 }, { "epoch": 0.30420105026256566, "grad_norm": 0.854941725730896, "learning_rate": 2e-05, "loss": 0.666, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4866, "tokens_per_second_per_gpu": 18698.14, "total_tokens": 480620369 }, { "epoch": 0.30426356589147285, "grad_norm": 0.8306474685668945, "learning_rate": 2e-05, "loss": 0.6332, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4867, "tokens_per_second_per_gpu": 16750.82, "total_tokens": 480720235 }, { "epoch": 0.3043260815203801, "grad_norm": 0.8755360245704651, "learning_rate": 2e-05, "loss": 0.6513, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4868, "tokens_per_second_per_gpu": 17818.9, "total_tokens": 480820803 }, { "epoch": 0.30438859714928734, "grad_norm": 0.9309961199760437, "learning_rate": 2e-05, "loss": 0.6745, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4869, "tokens_per_second_per_gpu": 17137.55, "total_tokens": 480920600 }, { "epoch": 0.30445111277819453, "grad_norm": 0.8613862991333008, "learning_rate": 2e-05, "loss": 0.6343, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4870, "tokens_per_second_per_gpu": 18150.04, "total_tokens": 481019706 }, { "epoch": 0.3045136284071018, "grad_norm": 0.9409862756729126, "learning_rate": 2e-05, "loss": 0.6358, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4871, "tokens_per_second_per_gpu": 18155.83, "total_tokens": 481117325 }, { "epoch": 0.304576144036009, "grad_norm": 0.8664385676383972, "learning_rate": 2e-05, "loss": 0.6284, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4872, "tokens_per_second_per_gpu": 17421.02, "total_tokens": 481217617 }, { "epoch": 0.3046386596649162, "grad_norm": 0.909436821937561, "learning_rate": 2e-05, "loss": 0.6512, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4873, "tokens_per_second_per_gpu": 16590.92, "total_tokens": 481309706 }, { "epoch": 0.30470117529382346, "grad_norm": 0.9396708011627197, "learning_rate": 2e-05, "loss": 0.6902, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4874, "tokens_per_second_per_gpu": 16199.95, "total_tokens": 481408719 }, { "epoch": 0.3047636909227307, "grad_norm": 0.9168402552604675, "learning_rate": 2e-05, "loss": 0.6988, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4875, "tokens_per_second_per_gpu": 18275.43, "total_tokens": 481509391 }, { "epoch": 0.3048262065516379, "grad_norm": 0.8709501028060913, "learning_rate": 2e-05, "loss": 0.6955, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4876, "tokens_per_second_per_gpu": 17835.49, "total_tokens": 481613961 }, { "epoch": 0.30488872218054514, "grad_norm": 0.8819541335105896, "learning_rate": 2e-05, "loss": 0.6972, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4877, "tokens_per_second_per_gpu": 18387.97, "total_tokens": 481718201 }, { "epoch": 0.3049512378094524, "grad_norm": 0.9293343424797058, "learning_rate": 2e-05, "loss": 0.7363, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4878, "tokens_per_second_per_gpu": 18629.33, "total_tokens": 481820508 }, { "epoch": 0.30501375343835957, "grad_norm": 0.9060975909233093, "learning_rate": 2e-05, "loss": 0.6414, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4879, "tokens_per_second_per_gpu": 17208.11, "total_tokens": 481920436 }, { "epoch": 0.3050762690672668, "grad_norm": 0.8955007195472717, "learning_rate": 2e-05, "loss": 0.6803, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4880, "tokens_per_second_per_gpu": 16726.19, "total_tokens": 482016058 }, { "epoch": 0.30513878469617406, "grad_norm": 0.8746342062950134, "learning_rate": 2e-05, "loss": 0.6662, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4881, "tokens_per_second_per_gpu": 17374.81, "total_tokens": 482112124 }, { "epoch": 0.30520130032508125, "grad_norm": 0.9105984568595886, "learning_rate": 2e-05, "loss": 0.6954, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4882, "tokens_per_second_per_gpu": 16682.41, "total_tokens": 482205740 }, { "epoch": 0.3052638159539885, "grad_norm": 0.9356640577316284, "learning_rate": 2e-05, "loss": 0.6803, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4883, "tokens_per_second_per_gpu": 17757.54, "total_tokens": 482306366 }, { "epoch": 0.30532633158289574, "grad_norm": 0.9190807342529297, "learning_rate": 2e-05, "loss": 0.6474, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4884, "tokens_per_second_per_gpu": 16452.17, "total_tokens": 482401252 }, { "epoch": 0.3053888472118029, "grad_norm": 0.8632121682167053, "learning_rate": 2e-05, "loss": 0.6697, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4885, "tokens_per_second_per_gpu": 17174.44, "total_tokens": 482502837 }, { "epoch": 0.30545136284071017, "grad_norm": 0.9291788935661316, "learning_rate": 2e-05, "loss": 0.6409, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4886, "tokens_per_second_per_gpu": 16139.76, "total_tokens": 482594426 }, { "epoch": 0.3055138784696174, "grad_norm": 0.9570616483688354, "learning_rate": 2e-05, "loss": 0.669, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4887, "tokens_per_second_per_gpu": 16907.29, "total_tokens": 482692099 }, { "epoch": 0.3055763940985246, "grad_norm": 0.9506924152374268, "learning_rate": 2e-05, "loss": 0.6916, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4888, "tokens_per_second_per_gpu": 17463.96, "total_tokens": 482791190 }, { "epoch": 0.30563890972743185, "grad_norm": 0.9217967391014099, "learning_rate": 2e-05, "loss": 0.6773, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4889, "tokens_per_second_per_gpu": 17058.67, "total_tokens": 482884828 }, { "epoch": 0.3057014253563391, "grad_norm": 0.9450795650482178, "learning_rate": 2e-05, "loss": 0.7233, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4890, "tokens_per_second_per_gpu": 17605.37, "total_tokens": 482985109 }, { "epoch": 0.3057639409852463, "grad_norm": 0.9250754714012146, "learning_rate": 2e-05, "loss": 0.7142, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4891, "tokens_per_second_per_gpu": 18258.93, "total_tokens": 483086160 }, { "epoch": 0.30582645661415353, "grad_norm": 0.9037228226661682, "learning_rate": 2e-05, "loss": 0.6539, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4892, "tokens_per_second_per_gpu": 17628.51, "total_tokens": 483182441 }, { "epoch": 0.3058889722430608, "grad_norm": 0.9330929517745972, "learning_rate": 2e-05, "loss": 0.6712, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4893, "tokens_per_second_per_gpu": 17147.55, "total_tokens": 483284365 }, { "epoch": 0.30595148787196796, "grad_norm": 0.9203913807868958, "learning_rate": 2e-05, "loss": 0.6269, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4894, "tokens_per_second_per_gpu": 16466.29, "total_tokens": 483381881 }, { "epoch": 0.3060140035008752, "grad_norm": 0.9115570187568665, "learning_rate": 2e-05, "loss": 0.6759, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4895, "tokens_per_second_per_gpu": 18510.19, "total_tokens": 483484445 }, { "epoch": 0.30607651912978245, "grad_norm": 0.9215420484542847, "learning_rate": 2e-05, "loss": 0.6691, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4896, "tokens_per_second_per_gpu": 17742.67, "total_tokens": 483580825 }, { "epoch": 0.3061390347586897, "grad_norm": 0.9162130951881409, "learning_rate": 2e-05, "loss": 0.6731, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4897, "tokens_per_second_per_gpu": 16762.24, "total_tokens": 483678804 }, { "epoch": 0.3062015503875969, "grad_norm": 0.9107618927955627, "learning_rate": 2e-05, "loss": 0.7344, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4898, "tokens_per_second_per_gpu": 17775.31, "total_tokens": 483782687 }, { "epoch": 0.30626406601650413, "grad_norm": 0.9025052189826965, "learning_rate": 2e-05, "loss": 0.6645, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4899, "tokens_per_second_per_gpu": 17318.64, "total_tokens": 483881241 }, { "epoch": 0.3063265816454114, "grad_norm": 0.9213260412216187, "learning_rate": 2e-05, "loss": 0.6896, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4900, "tokens_per_second_per_gpu": 17773.83, "total_tokens": 483981642 }, { "epoch": 0.30638909727431857, "grad_norm": 0.8653709888458252, "learning_rate": 2e-05, "loss": 0.629, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4901, "tokens_per_second_per_gpu": 17782.46, "total_tokens": 484079621 }, { "epoch": 0.3064516129032258, "grad_norm": 0.8747435212135315, "learning_rate": 2e-05, "loss": 0.6435, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4902, "tokens_per_second_per_gpu": 18102.64, "total_tokens": 484177886 }, { "epoch": 0.30651412853213306, "grad_norm": 0.9025323390960693, "learning_rate": 2e-05, "loss": 0.6857, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4903, "tokens_per_second_per_gpu": 16892.5, "total_tokens": 484274998 }, { "epoch": 0.30657664416104025, "grad_norm": 0.9012174010276794, "learning_rate": 2e-05, "loss": 0.7223, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4904, "tokens_per_second_per_gpu": 17903.97, "total_tokens": 484377888 }, { "epoch": 0.3066391597899475, "grad_norm": 0.8992982506752014, "learning_rate": 2e-05, "loss": 0.6728, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4905, "tokens_per_second_per_gpu": 17983.51, "total_tokens": 484476364 }, { "epoch": 0.30670167541885474, "grad_norm": 0.9566311836242676, "learning_rate": 2e-05, "loss": 0.66, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4906, "tokens_per_second_per_gpu": 16849.58, "total_tokens": 484572890 }, { "epoch": 0.3067641910477619, "grad_norm": 0.9349552989006042, "learning_rate": 2e-05, "loss": 0.6778, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4907, "tokens_per_second_per_gpu": 17500.22, "total_tokens": 484671424 }, { "epoch": 0.30682670667666917, "grad_norm": 0.9036012887954712, "learning_rate": 2e-05, "loss": 0.6647, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4908, "tokens_per_second_per_gpu": 17359.77, "total_tokens": 484767637 }, { "epoch": 0.3068892223055764, "grad_norm": 0.8956493735313416, "learning_rate": 2e-05, "loss": 0.6717, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4909, "tokens_per_second_per_gpu": 16610.62, "total_tokens": 484864515 }, { "epoch": 0.3069517379344836, "grad_norm": 0.8695055246353149, "learning_rate": 2e-05, "loss": 0.6429, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4910, "tokens_per_second_per_gpu": 17899.4, "total_tokens": 484964727 }, { "epoch": 0.30701425356339085, "grad_norm": 0.9101895093917847, "learning_rate": 2e-05, "loss": 0.6671, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4911, "tokens_per_second_per_gpu": 16597.37, "total_tokens": 485062473 }, { "epoch": 0.3070767691922981, "grad_norm": 0.9114022850990295, "learning_rate": 2e-05, "loss": 0.6636, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4912, "tokens_per_second_per_gpu": 18092.4, "total_tokens": 485162248 }, { "epoch": 0.3071392848212053, "grad_norm": 0.8978428244590759, "learning_rate": 2e-05, "loss": 0.6527, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4913, "tokens_per_second_per_gpu": 16769.6, "total_tokens": 485257197 }, { "epoch": 0.3072018004501125, "grad_norm": 0.8739204406738281, "learning_rate": 2e-05, "loss": 0.6737, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4914, "tokens_per_second_per_gpu": 17986.38, "total_tokens": 485359903 }, { "epoch": 0.3072643160790198, "grad_norm": 0.8571150302886963, "learning_rate": 2e-05, "loss": 0.6227, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4915, "tokens_per_second_per_gpu": 17326.21, "total_tokens": 485458733 }, { "epoch": 0.30732683170792696, "grad_norm": 0.9102663397789001, "learning_rate": 2e-05, "loss": 0.6755, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4916, "tokens_per_second_per_gpu": 17439.87, "total_tokens": 485555975 }, { "epoch": 0.3073893473368342, "grad_norm": 0.8764846324920654, "learning_rate": 2e-05, "loss": 0.6546, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4917, "tokens_per_second_per_gpu": 16294.25, "total_tokens": 485654791 }, { "epoch": 0.30745186296574145, "grad_norm": 0.9391640424728394, "learning_rate": 2e-05, "loss": 0.7031, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4918, "tokens_per_second_per_gpu": 17390.72, "total_tokens": 485753936 }, { "epoch": 0.30751437859464864, "grad_norm": 0.9334320425987244, "learning_rate": 2e-05, "loss": 0.6456, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4919, "tokens_per_second_per_gpu": 16499.67, "total_tokens": 485848822 }, { "epoch": 0.3075768942235559, "grad_norm": 0.9306652545928955, "learning_rate": 2e-05, "loss": 0.6472, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4920, "tokens_per_second_per_gpu": 17162.44, "total_tokens": 485949573 }, { "epoch": 0.30763940985246313, "grad_norm": 0.9221125841140747, "learning_rate": 2e-05, "loss": 0.7424, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4921, "tokens_per_second_per_gpu": 17115.89, "total_tokens": 486049010 }, { "epoch": 0.3077019254813703, "grad_norm": 0.9106485247612, "learning_rate": 2e-05, "loss": 0.6863, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4922, "tokens_per_second_per_gpu": 17070.27, "total_tokens": 486150108 }, { "epoch": 0.30776444111027756, "grad_norm": 0.9013525247573853, "learning_rate": 2e-05, "loss": 0.6757, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4923, "tokens_per_second_per_gpu": 16732.03, "total_tokens": 486245017 }, { "epoch": 0.3078269567391848, "grad_norm": 0.8981431126594543, "learning_rate": 2e-05, "loss": 0.6839, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4924, "tokens_per_second_per_gpu": 17376.71, "total_tokens": 486345034 }, { "epoch": 0.307889472368092, "grad_norm": 0.9257641434669495, "learning_rate": 2e-05, "loss": 0.6945, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4925, "tokens_per_second_per_gpu": 17583.99, "total_tokens": 486445538 }, { "epoch": 0.30795198799699924, "grad_norm": 0.8644065856933594, "learning_rate": 2e-05, "loss": 0.648, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4926, "tokens_per_second_per_gpu": 17012.64, "total_tokens": 486543756 }, { "epoch": 0.3080145036259065, "grad_norm": 0.9143330454826355, "learning_rate": 2e-05, "loss": 0.7229, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4927, "tokens_per_second_per_gpu": 16746.48, "total_tokens": 486642772 }, { "epoch": 0.3080770192548137, "grad_norm": 0.9143404364585876, "learning_rate": 2e-05, "loss": 0.6715, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4928, "tokens_per_second_per_gpu": 16487.83, "total_tokens": 486740078 }, { "epoch": 0.3081395348837209, "grad_norm": 0.8922734260559082, "learning_rate": 2e-05, "loss": 0.6472, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4929, "tokens_per_second_per_gpu": 18162.07, "total_tokens": 486838891 }, { "epoch": 0.30820205051262817, "grad_norm": 0.8859297633171082, "learning_rate": 2e-05, "loss": 0.6637, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4930, "tokens_per_second_per_gpu": 17606.62, "total_tokens": 486937166 }, { "epoch": 0.30826456614153536, "grad_norm": 0.8832114934921265, "learning_rate": 2e-05, "loss": 0.7142, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4931, "tokens_per_second_per_gpu": 18672.61, "total_tokens": 487041683 }, { "epoch": 0.3083270817704426, "grad_norm": 0.8627868890762329, "learning_rate": 2e-05, "loss": 0.6664, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4932, "tokens_per_second_per_gpu": 17928.9, "total_tokens": 487144014 }, { "epoch": 0.30838959739934985, "grad_norm": 0.884354293346405, "learning_rate": 2e-05, "loss": 0.6571, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4933, "tokens_per_second_per_gpu": 17342.07, "total_tokens": 487244713 }, { "epoch": 0.3084521130282571, "grad_norm": 0.874826967716217, "learning_rate": 2e-05, "loss": 0.6853, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4934, "tokens_per_second_per_gpu": 17970.92, "total_tokens": 487346718 }, { "epoch": 0.3085146286571643, "grad_norm": 0.9060757756233215, "learning_rate": 2e-05, "loss": 0.6996, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4935, "tokens_per_second_per_gpu": 16579.8, "total_tokens": 487446359 }, { "epoch": 0.3085771442860715, "grad_norm": 0.9202418923377991, "learning_rate": 2e-05, "loss": 0.6485, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4936, "tokens_per_second_per_gpu": 15860.72, "total_tokens": 487540167 }, { "epoch": 0.30863965991497877, "grad_norm": 0.9231186509132385, "learning_rate": 2e-05, "loss": 0.6678, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4937, "tokens_per_second_per_gpu": 19329.34, "total_tokens": 487640189 }, { "epoch": 0.30870217554388596, "grad_norm": 0.9160084128379822, "learning_rate": 2e-05, "loss": 0.7289, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4938, "tokens_per_second_per_gpu": 17752.37, "total_tokens": 487745405 }, { "epoch": 0.3087646911727932, "grad_norm": 0.8944168090820312, "learning_rate": 2e-05, "loss": 0.7024, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4939, "tokens_per_second_per_gpu": 16950.01, "total_tokens": 487843979 }, { "epoch": 0.30882720680170045, "grad_norm": 0.8735829591751099, "learning_rate": 2e-05, "loss": 0.6814, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4940, "tokens_per_second_per_gpu": 17881.59, "total_tokens": 487947534 }, { "epoch": 0.30888972243060764, "grad_norm": 0.9247249364852905, "learning_rate": 2e-05, "loss": 0.6799, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4941, "tokens_per_second_per_gpu": 17547.61, "total_tokens": 488045141 }, { "epoch": 0.3089522380595149, "grad_norm": 0.8827061057090759, "learning_rate": 2e-05, "loss": 0.6624, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4942, "tokens_per_second_per_gpu": 17701.82, "total_tokens": 488144456 }, { "epoch": 0.30901475368842213, "grad_norm": 0.9198678731918335, "learning_rate": 2e-05, "loss": 0.6978, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4943, "tokens_per_second_per_gpu": 16895.65, "total_tokens": 488245971 }, { "epoch": 0.3090772693173293, "grad_norm": 0.9142024517059326, "learning_rate": 2e-05, "loss": 0.7013, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4944, "tokens_per_second_per_gpu": 17862.97, "total_tokens": 488346922 }, { "epoch": 0.30913978494623656, "grad_norm": 0.9181055426597595, "learning_rate": 2e-05, "loss": 0.7012, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4945, "tokens_per_second_per_gpu": 17800.02, "total_tokens": 488447807 }, { "epoch": 0.3092023005751438, "grad_norm": 0.914817750453949, "learning_rate": 2e-05, "loss": 0.6436, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4946, "tokens_per_second_per_gpu": 18384.45, "total_tokens": 488549213 }, { "epoch": 0.309264816204051, "grad_norm": 0.899415135383606, "learning_rate": 2e-05, "loss": 0.662, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4947, "tokens_per_second_per_gpu": 17023.89, "total_tokens": 488647298 }, { "epoch": 0.30932733183295824, "grad_norm": 0.9663051962852478, "learning_rate": 2e-05, "loss": 0.6613, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4948, "tokens_per_second_per_gpu": 17222.36, "total_tokens": 488746750 }, { "epoch": 0.3093898474618655, "grad_norm": 0.878823459148407, "learning_rate": 2e-05, "loss": 0.6409, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4949, "tokens_per_second_per_gpu": 17896.49, "total_tokens": 488845348 }, { "epoch": 0.3094523630907727, "grad_norm": 0.9293816089630127, "learning_rate": 2e-05, "loss": 0.6542, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4950, "tokens_per_second_per_gpu": 16726.67, "total_tokens": 488939023 }, { "epoch": 0.3095148787196799, "grad_norm": 0.9346973299980164, "learning_rate": 2e-05, "loss": 0.674, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4951, "tokens_per_second_per_gpu": 16896.04, "total_tokens": 489035108 }, { "epoch": 0.30957739434858716, "grad_norm": 0.9255589246749878, "learning_rate": 2e-05, "loss": 0.6808, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4952, "tokens_per_second_per_gpu": 17710.51, "total_tokens": 489134012 }, { "epoch": 0.30963990997749435, "grad_norm": 0.8999944925308228, "learning_rate": 2e-05, "loss": 0.63, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4953, "tokens_per_second_per_gpu": 17212.95, "total_tokens": 489232923 }, { "epoch": 0.3097024256064016, "grad_norm": 0.9199032783508301, "learning_rate": 2e-05, "loss": 0.6753, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4954, "tokens_per_second_per_gpu": 18648.99, "total_tokens": 489334125 }, { "epoch": 0.30976494123530884, "grad_norm": 0.8822091817855835, "learning_rate": 2e-05, "loss": 0.6674, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4955, "tokens_per_second_per_gpu": 16626.12, "total_tokens": 489435068 }, { "epoch": 0.30982745686421603, "grad_norm": 0.9062288999557495, "learning_rate": 2e-05, "loss": 0.6535, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4956, "tokens_per_second_per_gpu": 16499.29, "total_tokens": 489531291 }, { "epoch": 0.3098899724931233, "grad_norm": 0.8765923380851746, "learning_rate": 2e-05, "loss": 0.6847, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4957, "tokens_per_second_per_gpu": 17106.64, "total_tokens": 489632103 }, { "epoch": 0.3099524881220305, "grad_norm": 0.9208858013153076, "learning_rate": 2e-05, "loss": 0.6811, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4958, "tokens_per_second_per_gpu": 16109.19, "total_tokens": 489727649 }, { "epoch": 0.3100150037509377, "grad_norm": 0.9276655912399292, "learning_rate": 2e-05, "loss": 0.7087, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4959, "tokens_per_second_per_gpu": 15949.83, "total_tokens": 489825383 }, { "epoch": 0.31007751937984496, "grad_norm": 0.8933209180831909, "learning_rate": 2e-05, "loss": 0.6653, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4960, "tokens_per_second_per_gpu": 17705.91, "total_tokens": 489925065 }, { "epoch": 0.3101400350087522, "grad_norm": 0.8428560495376587, "learning_rate": 2e-05, "loss": 0.6198, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4961, "tokens_per_second_per_gpu": 16735.98, "total_tokens": 490021901 }, { "epoch": 0.3102025506376594, "grad_norm": 0.9078617691993713, "learning_rate": 2e-05, "loss": 0.7065, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4962, "tokens_per_second_per_gpu": 17198.41, "total_tokens": 490119141 }, { "epoch": 0.31026506626656664, "grad_norm": 0.8906441926956177, "learning_rate": 2e-05, "loss": 0.6786, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4963, "tokens_per_second_per_gpu": 17061.17, "total_tokens": 490214181 }, { "epoch": 0.3103275818954739, "grad_norm": 0.8921752572059631, "learning_rate": 2e-05, "loss": 0.6568, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4964, "tokens_per_second_per_gpu": 16264.57, "total_tokens": 490307646 }, { "epoch": 0.31039009752438107, "grad_norm": 0.8455761075019836, "learning_rate": 2e-05, "loss": 0.6767, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4965, "tokens_per_second_per_gpu": 18889.05, "total_tokens": 490412606 }, { "epoch": 0.3104526131532883, "grad_norm": 0.9149814248085022, "learning_rate": 2e-05, "loss": 0.7395, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4966, "tokens_per_second_per_gpu": 18987.44, "total_tokens": 490515747 }, { "epoch": 0.31051512878219556, "grad_norm": 0.8912836313247681, "learning_rate": 2e-05, "loss": 0.6542, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4967, "tokens_per_second_per_gpu": 17191.16, "total_tokens": 490616434 }, { "epoch": 0.31057764441110275, "grad_norm": 0.8853868842124939, "learning_rate": 2e-05, "loss": 0.6622, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4968, "tokens_per_second_per_gpu": 17055.85, "total_tokens": 490712045 }, { "epoch": 0.31064016004001, "grad_norm": 0.8775354027748108, "learning_rate": 2e-05, "loss": 0.654, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4969, "tokens_per_second_per_gpu": 16104.29, "total_tokens": 490807291 }, { "epoch": 0.31070267566891724, "grad_norm": 0.8796702027320862, "learning_rate": 2e-05, "loss": 0.6611, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4970, "tokens_per_second_per_gpu": 18212.15, "total_tokens": 490910063 }, { "epoch": 0.3107651912978245, "grad_norm": 0.9236223697662354, "learning_rate": 2e-05, "loss": 0.6749, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4971, "tokens_per_second_per_gpu": 16659.68, "total_tokens": 491006475 }, { "epoch": 0.3108277069267317, "grad_norm": 0.8881881237030029, "learning_rate": 2e-05, "loss": 0.7236, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4972, "tokens_per_second_per_gpu": 18135.9, "total_tokens": 491110941 }, { "epoch": 0.3108902225556389, "grad_norm": 0.8826248645782471, "learning_rate": 2e-05, "loss": 0.6497, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4973, "tokens_per_second_per_gpu": 17968.36, "total_tokens": 491210499 }, { "epoch": 0.31095273818454616, "grad_norm": 0.882827639579773, "learning_rate": 2e-05, "loss": 0.6829, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4974, "tokens_per_second_per_gpu": 18686.41, "total_tokens": 491312528 }, { "epoch": 0.31101525381345335, "grad_norm": 0.9019566178321838, "learning_rate": 2e-05, "loss": 0.7045, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4975, "tokens_per_second_per_gpu": 18154.81, "total_tokens": 491414732 }, { "epoch": 0.3110777694423606, "grad_norm": 0.9432950615882874, "learning_rate": 2e-05, "loss": 0.6917, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4976, "tokens_per_second_per_gpu": 16187.59, "total_tokens": 491508256 }, { "epoch": 0.31114028507126784, "grad_norm": 0.9136217832565308, "learning_rate": 2e-05, "loss": 0.7034, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4977, "tokens_per_second_per_gpu": 17561.5, "total_tokens": 491608678 }, { "epoch": 0.31120280070017503, "grad_norm": 0.8984993100166321, "learning_rate": 2e-05, "loss": 0.6535, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4978, "tokens_per_second_per_gpu": 17059.48, "total_tokens": 491707691 }, { "epoch": 0.3112653163290823, "grad_norm": 0.9249938130378723, "learning_rate": 2e-05, "loss": 0.7074, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4979, "tokens_per_second_per_gpu": 17651.93, "total_tokens": 491803816 }, { "epoch": 0.3113278319579895, "grad_norm": 0.8905645608901978, "learning_rate": 2e-05, "loss": 0.6348, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4980, "tokens_per_second_per_gpu": 16461.57, "total_tokens": 491897225 }, { "epoch": 0.3113903475868967, "grad_norm": 0.8696878552436829, "learning_rate": 2e-05, "loss": 0.669, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4981, "tokens_per_second_per_gpu": 17024.92, "total_tokens": 491997343 }, { "epoch": 0.31145286321580395, "grad_norm": 0.8988864421844482, "learning_rate": 2e-05, "loss": 0.6723, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4982, "tokens_per_second_per_gpu": 17831.67, "total_tokens": 492098983 }, { "epoch": 0.3115153788447112, "grad_norm": 0.8936425447463989, "learning_rate": 2e-05, "loss": 0.6761, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4983, "tokens_per_second_per_gpu": 16705.35, "total_tokens": 492194898 }, { "epoch": 0.3115778944736184, "grad_norm": 0.8693397641181946, "learning_rate": 2e-05, "loss": 0.6933, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4984, "tokens_per_second_per_gpu": 18143.33, "total_tokens": 492295295 }, { "epoch": 0.31164041010252563, "grad_norm": 0.8690624237060547, "learning_rate": 2e-05, "loss": 0.6358, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4985, "tokens_per_second_per_gpu": 16872.29, "total_tokens": 492395760 }, { "epoch": 0.3117029257314329, "grad_norm": 0.8734309673309326, "learning_rate": 2e-05, "loss": 0.6886, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4986, "tokens_per_second_per_gpu": 17861.13, "total_tokens": 492495398 }, { "epoch": 0.31176544136034007, "grad_norm": 0.9264188408851624, "learning_rate": 2e-05, "loss": 0.6954, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4987, "tokens_per_second_per_gpu": 17598.24, "total_tokens": 492598240 }, { "epoch": 0.3118279569892473, "grad_norm": 0.9046276211738586, "learning_rate": 2e-05, "loss": 0.6929, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4988, "tokens_per_second_per_gpu": 17650.08, "total_tokens": 492697750 }, { "epoch": 0.31189047261815456, "grad_norm": 0.8665385246276855, "learning_rate": 2e-05, "loss": 0.6434, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4989, "tokens_per_second_per_gpu": 18955.45, "total_tokens": 492799787 }, { "epoch": 0.31195298824706175, "grad_norm": 1.0178496837615967, "learning_rate": 2e-05, "loss": 0.6586, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4990, "tokens_per_second_per_gpu": 14974.36, "total_tokens": 492889922 }, { "epoch": 0.312015503875969, "grad_norm": 0.9340102672576904, "learning_rate": 2e-05, "loss": 0.69, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4991, "tokens_per_second_per_gpu": 17792.22, "total_tokens": 492990884 }, { "epoch": 0.31207801950487624, "grad_norm": 0.9549524784088135, "learning_rate": 2e-05, "loss": 0.7278, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4992, "tokens_per_second_per_gpu": 18911.03, "total_tokens": 493092860 }, { "epoch": 0.3121405351337834, "grad_norm": 0.9566662311553955, "learning_rate": 2e-05, "loss": 0.6898, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4993, "tokens_per_second_per_gpu": 16581.98, "total_tokens": 493186764 }, { "epoch": 0.31220305076269067, "grad_norm": 0.9103794693946838, "learning_rate": 2e-05, "loss": 0.6762, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4994, "tokens_per_second_per_gpu": 17085.38, "total_tokens": 493283362 }, { "epoch": 0.3122655663915979, "grad_norm": 0.8912706971168518, "learning_rate": 2e-05, "loss": 0.7021, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4995, "tokens_per_second_per_gpu": 18515.51, "total_tokens": 493386208 }, { "epoch": 0.3123280820205051, "grad_norm": 0.8605836033821106, "learning_rate": 2e-05, "loss": 0.6247, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4996, "tokens_per_second_per_gpu": 17248.45, "total_tokens": 493485071 }, { "epoch": 0.31239059764941235, "grad_norm": 0.8802466988563538, "learning_rate": 2e-05, "loss": 0.6086, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4997, "tokens_per_second_per_gpu": 16975.66, "total_tokens": 493578983 }, { "epoch": 0.3124531132783196, "grad_norm": 0.9084078073501587, "learning_rate": 2e-05, "loss": 0.7169, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4998, "tokens_per_second_per_gpu": 17744.09, "total_tokens": 493678800 }, { "epoch": 0.3125156289072268, "grad_norm": 0.9082481861114502, "learning_rate": 2e-05, "loss": 0.6908, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 4999, "tokens_per_second_per_gpu": 17246.73, "total_tokens": 493775163 }, { "epoch": 0.31257814453613403, "grad_norm": 0.9123373031616211, "learning_rate": 2e-05, "loss": 0.6663, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5000, "tokens_per_second_per_gpu": 16751.27, "total_tokens": 493872357 }, { "epoch": 0.3126406601650413, "grad_norm": 0.8871045112609863, "learning_rate": 2e-05, "loss": 0.6954, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5001, "tokens_per_second_per_gpu": 17734.44, "total_tokens": 493970707 }, { "epoch": 0.31270317579394846, "grad_norm": 0.8792181611061096, "learning_rate": 2e-05, "loss": 0.6746, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5002, "tokens_per_second_per_gpu": 17544.32, "total_tokens": 494072927 }, { "epoch": 0.3127656914228557, "grad_norm": 0.9192419052124023, "learning_rate": 2e-05, "loss": 0.6758, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5003, "tokens_per_second_per_gpu": 16497.42, "total_tokens": 494170130 }, { "epoch": 0.31282820705176295, "grad_norm": 0.9387195706367493, "learning_rate": 2e-05, "loss": 0.6893, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5004, "tokens_per_second_per_gpu": 17125.83, "total_tokens": 494268793 }, { "epoch": 0.31289072268067014, "grad_norm": 0.9283089637756348, "learning_rate": 2e-05, "loss": 0.6736, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5005, "tokens_per_second_per_gpu": 17399.86, "total_tokens": 494365884 }, { "epoch": 0.3129532383095774, "grad_norm": 0.8975427746772766, "learning_rate": 2e-05, "loss": 0.6626, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5006, "tokens_per_second_per_gpu": 17683.92, "total_tokens": 494465497 }, { "epoch": 0.31301575393848463, "grad_norm": 0.8885202407836914, "learning_rate": 2e-05, "loss": 0.6454, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5007, "tokens_per_second_per_gpu": 17524.94, "total_tokens": 494566084 }, { "epoch": 0.3130782695673918, "grad_norm": 0.8963393568992615, "learning_rate": 2e-05, "loss": 0.6682, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5008, "tokens_per_second_per_gpu": 17666.38, "total_tokens": 494663097 }, { "epoch": 0.31314078519629907, "grad_norm": 0.9748536348342896, "learning_rate": 2e-05, "loss": 0.6878, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5009, "tokens_per_second_per_gpu": 17896.52, "total_tokens": 494761674 }, { "epoch": 0.3132033008252063, "grad_norm": 0.8718572854995728, "learning_rate": 2e-05, "loss": 0.6166, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5010, "tokens_per_second_per_gpu": 17119.46, "total_tokens": 494861437 }, { "epoch": 0.31326581645411355, "grad_norm": 0.9064669013023376, "learning_rate": 2e-05, "loss": 0.6673, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5011, "tokens_per_second_per_gpu": 16776.57, "total_tokens": 494958653 }, { "epoch": 0.31332833208302074, "grad_norm": 0.882546603679657, "learning_rate": 2e-05, "loss": 0.6775, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5012, "tokens_per_second_per_gpu": 17382.56, "total_tokens": 495058507 }, { "epoch": 0.313390847711928, "grad_norm": 0.9221593737602234, "learning_rate": 2e-05, "loss": 0.6589, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5013, "tokens_per_second_per_gpu": 17351.7, "total_tokens": 495155046 }, { "epoch": 0.31345336334083523, "grad_norm": 0.9050133228302002, "learning_rate": 2e-05, "loss": 0.6343, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5014, "tokens_per_second_per_gpu": 17159.94, "total_tokens": 495248242 }, { "epoch": 0.3135158789697424, "grad_norm": 0.9430257678031921, "learning_rate": 2e-05, "loss": 0.6849, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5015, "tokens_per_second_per_gpu": 16701.25, "total_tokens": 495343054 }, { "epoch": 0.31357839459864967, "grad_norm": 0.9410940408706665, "learning_rate": 2e-05, "loss": 0.6153, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5016, "tokens_per_second_per_gpu": 15199.99, "total_tokens": 495433850 }, { "epoch": 0.3136409102275569, "grad_norm": 0.9234497547149658, "learning_rate": 2e-05, "loss": 0.688, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5017, "tokens_per_second_per_gpu": 16638.49, "total_tokens": 495528626 }, { "epoch": 0.3137034258564641, "grad_norm": 0.8999039530754089, "learning_rate": 2e-05, "loss": 0.6875, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5018, "tokens_per_second_per_gpu": 17698.21, "total_tokens": 495629000 }, { "epoch": 0.31376594148537135, "grad_norm": 0.8745518922805786, "learning_rate": 2e-05, "loss": 0.6677, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5019, "tokens_per_second_per_gpu": 17653.69, "total_tokens": 495728848 }, { "epoch": 0.3138284571142786, "grad_norm": 0.8999359607696533, "learning_rate": 2e-05, "loss": 0.673, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5020, "tokens_per_second_per_gpu": 16261.94, "total_tokens": 495823972 }, { "epoch": 0.3138909727431858, "grad_norm": 0.8881751894950867, "learning_rate": 2e-05, "loss": 0.6839, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5021, "tokens_per_second_per_gpu": 18177.47, "total_tokens": 495925138 }, { "epoch": 0.313953488372093, "grad_norm": 0.8811730146408081, "learning_rate": 2e-05, "loss": 0.6492, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5022, "tokens_per_second_per_gpu": 16679.84, "total_tokens": 496022341 }, { "epoch": 0.31401600400100027, "grad_norm": 0.9211058616638184, "learning_rate": 2e-05, "loss": 0.6441, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5023, "tokens_per_second_per_gpu": 17086.62, "total_tokens": 496120438 }, { "epoch": 0.31407851962990746, "grad_norm": 0.9079728722572327, "learning_rate": 2e-05, "loss": 0.6791, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5024, "tokens_per_second_per_gpu": 17060.95, "total_tokens": 496218910 }, { "epoch": 0.3141410352588147, "grad_norm": 0.8935399651527405, "learning_rate": 2e-05, "loss": 0.6874, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5025, "tokens_per_second_per_gpu": 17517.41, "total_tokens": 496317346 }, { "epoch": 0.31420355088772195, "grad_norm": 0.8753993511199951, "learning_rate": 2e-05, "loss": 0.6579, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5026, "tokens_per_second_per_gpu": 17315.85, "total_tokens": 496415785 }, { "epoch": 0.31426606651662914, "grad_norm": 0.863068699836731, "learning_rate": 2e-05, "loss": 0.6896, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5027, "tokens_per_second_per_gpu": 17966.48, "total_tokens": 496518917 }, { "epoch": 0.3143285821455364, "grad_norm": 0.8691979050636292, "learning_rate": 2e-05, "loss": 0.6634, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5028, "tokens_per_second_per_gpu": 16273.07, "total_tokens": 496617083 }, { "epoch": 0.31439109777444363, "grad_norm": 0.920353889465332, "learning_rate": 2e-05, "loss": 0.7205, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5029, "tokens_per_second_per_gpu": 18435.79, "total_tokens": 496717124 }, { "epoch": 0.3144536134033508, "grad_norm": 0.8797393441200256, "learning_rate": 2e-05, "loss": 0.6697, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5030, "tokens_per_second_per_gpu": 16867.43, "total_tokens": 496817326 }, { "epoch": 0.31451612903225806, "grad_norm": 0.8592624068260193, "learning_rate": 2e-05, "loss": 0.6513, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5031, "tokens_per_second_per_gpu": 17176.42, "total_tokens": 496917165 }, { "epoch": 0.3145786446611653, "grad_norm": 0.8921416997909546, "learning_rate": 2e-05, "loss": 0.669, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5032, "tokens_per_second_per_gpu": 15842.96, "total_tokens": 497013661 }, { "epoch": 0.3146411602900725, "grad_norm": 0.8580068945884705, "learning_rate": 2e-05, "loss": 0.6727, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5033, "tokens_per_second_per_gpu": 18696.11, "total_tokens": 497113754 }, { "epoch": 0.31470367591897974, "grad_norm": 0.9020376205444336, "learning_rate": 2e-05, "loss": 0.637, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5034, "tokens_per_second_per_gpu": 17112.33, "total_tokens": 497210067 }, { "epoch": 0.314766191547887, "grad_norm": 0.9059081077575684, "learning_rate": 2e-05, "loss": 0.6793, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5035, "tokens_per_second_per_gpu": 17005.4, "total_tokens": 497309791 }, { "epoch": 0.3148287071767942, "grad_norm": 0.9209501147270203, "learning_rate": 2e-05, "loss": 0.6763, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5036, "tokens_per_second_per_gpu": 18368.78, "total_tokens": 497412491 }, { "epoch": 0.3148912228057014, "grad_norm": 0.8542002439498901, "learning_rate": 2e-05, "loss": 0.6333, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5037, "tokens_per_second_per_gpu": 16991.86, "total_tokens": 497510943 }, { "epoch": 0.31495373843460867, "grad_norm": 0.8829591274261475, "learning_rate": 2e-05, "loss": 0.6587, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5038, "tokens_per_second_per_gpu": 17889.57, "total_tokens": 497608487 }, { "epoch": 0.31501625406351585, "grad_norm": 0.876904308795929, "learning_rate": 2e-05, "loss": 0.6839, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5039, "tokens_per_second_per_gpu": 17871.05, "total_tokens": 497709479 }, { "epoch": 0.3150787696924231, "grad_norm": 0.9461148977279663, "learning_rate": 2e-05, "loss": 0.7037, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5040, "tokens_per_second_per_gpu": 16753.77, "total_tokens": 497807770 }, { "epoch": 0.31514128532133034, "grad_norm": 1.0296456813812256, "learning_rate": 2e-05, "loss": 0.6368, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5041, "tokens_per_second_per_gpu": 17549.85, "total_tokens": 497905900 }, { "epoch": 0.31520380095023753, "grad_norm": 0.8904100060462952, "learning_rate": 2e-05, "loss": 0.6801, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5042, "tokens_per_second_per_gpu": 18599.13, "total_tokens": 498007847 }, { "epoch": 0.3152663165791448, "grad_norm": 0.8670162558555603, "learning_rate": 2e-05, "loss": 0.6338, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5043, "tokens_per_second_per_gpu": 17426.58, "total_tokens": 498106742 }, { "epoch": 0.315328832208052, "grad_norm": 0.9335365891456604, "learning_rate": 2e-05, "loss": 0.6924, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5044, "tokens_per_second_per_gpu": 17268.11, "total_tokens": 498205877 }, { "epoch": 0.3153913478369592, "grad_norm": 0.9368121027946472, "learning_rate": 2e-05, "loss": 0.7118, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5045, "tokens_per_second_per_gpu": 17440.34, "total_tokens": 498306567 }, { "epoch": 0.31545386346586646, "grad_norm": 0.9380207657814026, "learning_rate": 2e-05, "loss": 0.7092, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5046, "tokens_per_second_per_gpu": 17291.22, "total_tokens": 498403500 }, { "epoch": 0.3155163790947737, "grad_norm": 0.8983582854270935, "learning_rate": 2e-05, "loss": 0.6456, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5047, "tokens_per_second_per_gpu": 16883.42, "total_tokens": 498499765 }, { "epoch": 0.31557889472368095, "grad_norm": 0.8590087294578552, "learning_rate": 2e-05, "loss": 0.6442, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5048, "tokens_per_second_per_gpu": 17167.94, "total_tokens": 498598058 }, { "epoch": 0.31564141035258814, "grad_norm": 0.9178266525268555, "learning_rate": 2e-05, "loss": 0.665, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5049, "tokens_per_second_per_gpu": 17385.14, "total_tokens": 498694247 }, { "epoch": 0.3157039259814954, "grad_norm": 0.8650199770927429, "learning_rate": 2e-05, "loss": 0.6261, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5050, "tokens_per_second_per_gpu": 16567.0, "total_tokens": 498789682 }, { "epoch": 0.3157664416104026, "grad_norm": 0.8820925354957581, "learning_rate": 2e-05, "loss": 0.6537, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5051, "tokens_per_second_per_gpu": 16362.55, "total_tokens": 498887622 }, { "epoch": 0.3158289572393098, "grad_norm": 0.9066751003265381, "learning_rate": 2e-05, "loss": 0.6517, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5052, "tokens_per_second_per_gpu": 16188.28, "total_tokens": 498979595 }, { "epoch": 0.31589147286821706, "grad_norm": 0.8666742444038391, "learning_rate": 2e-05, "loss": 0.6543, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5053, "tokens_per_second_per_gpu": 18353.56, "total_tokens": 499080396 }, { "epoch": 0.3159539884971243, "grad_norm": 0.9283605813980103, "learning_rate": 2e-05, "loss": 0.6865, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5054, "tokens_per_second_per_gpu": 17779.98, "total_tokens": 499179553 }, { "epoch": 0.3160165041260315, "grad_norm": 0.8652984499931335, "learning_rate": 2e-05, "loss": 0.6098, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5055, "tokens_per_second_per_gpu": 17770.17, "total_tokens": 499277170 }, { "epoch": 0.31607901975493874, "grad_norm": 0.9384445548057556, "learning_rate": 2e-05, "loss": 0.6843, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5056, "tokens_per_second_per_gpu": 16829.33, "total_tokens": 499374554 }, { "epoch": 0.316141535383846, "grad_norm": 0.8789353370666504, "learning_rate": 2e-05, "loss": 0.6342, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5057, "tokens_per_second_per_gpu": 17149.52, "total_tokens": 499473586 }, { "epoch": 0.3162040510127532, "grad_norm": 0.9062649011611938, "learning_rate": 2e-05, "loss": 0.696, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5058, "tokens_per_second_per_gpu": 16267.87, "total_tokens": 499570963 }, { "epoch": 0.3162665666416604, "grad_norm": 0.8917474150657654, "learning_rate": 2e-05, "loss": 0.6585, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5059, "tokens_per_second_per_gpu": 16950.9, "total_tokens": 499669220 }, { "epoch": 0.31632908227056766, "grad_norm": 0.8877677321434021, "learning_rate": 2e-05, "loss": 0.671, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5060, "tokens_per_second_per_gpu": 17593.79, "total_tokens": 499770541 }, { "epoch": 0.31639159789947485, "grad_norm": 0.9194186925888062, "learning_rate": 2e-05, "loss": 0.6216, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5061, "tokens_per_second_per_gpu": 17071.46, "total_tokens": 499863381 }, { "epoch": 0.3164541135283821, "grad_norm": 0.8784071803092957, "learning_rate": 2e-05, "loss": 0.6218, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5062, "tokens_per_second_per_gpu": 16228.22, "total_tokens": 499957667 }, { "epoch": 0.31651662915728934, "grad_norm": 0.9385942220687866, "learning_rate": 2e-05, "loss": 0.642, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5063, "tokens_per_second_per_gpu": 16976.38, "total_tokens": 500054333 }, { "epoch": 0.31657914478619653, "grad_norm": 0.8645872473716736, "learning_rate": 2e-05, "loss": 0.6411, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5064, "tokens_per_second_per_gpu": 16713.01, "total_tokens": 500153123 }, { "epoch": 0.3166416604151038, "grad_norm": 0.8696631193161011, "learning_rate": 2e-05, "loss": 0.6664, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5065, "tokens_per_second_per_gpu": 17114.41, "total_tokens": 500250916 }, { "epoch": 0.316704176044011, "grad_norm": 0.8979365825653076, "learning_rate": 2e-05, "loss": 0.6984, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5066, "tokens_per_second_per_gpu": 18082.18, "total_tokens": 500352170 }, { "epoch": 0.3167666916729182, "grad_norm": 0.8962947726249695, "learning_rate": 2e-05, "loss": 0.7092, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5067, "tokens_per_second_per_gpu": 17202.02, "total_tokens": 500450356 }, { "epoch": 0.31682920730182546, "grad_norm": 0.897085964679718, "learning_rate": 2e-05, "loss": 0.6743, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5068, "tokens_per_second_per_gpu": 16890.03, "total_tokens": 500549357 }, { "epoch": 0.3168917229307327, "grad_norm": 0.864212691783905, "learning_rate": 2e-05, "loss": 0.6264, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5069, "tokens_per_second_per_gpu": 16522.0, "total_tokens": 500642878 }, { "epoch": 0.3169542385596399, "grad_norm": 0.8828648328781128, "learning_rate": 2e-05, "loss": 0.6236, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5070, "tokens_per_second_per_gpu": 16635.39, "total_tokens": 500738046 }, { "epoch": 0.31701675418854713, "grad_norm": 0.8382775783538818, "learning_rate": 2e-05, "loss": 0.6289, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5071, "tokens_per_second_per_gpu": 17131.36, "total_tokens": 500838012 }, { "epoch": 0.3170792698174544, "grad_norm": 0.8739805221557617, "learning_rate": 2e-05, "loss": 0.6864, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5072, "tokens_per_second_per_gpu": 17580.1, "total_tokens": 500938807 }, { "epoch": 0.31714178544636157, "grad_norm": 0.8842347860336304, "learning_rate": 2e-05, "loss": 0.6718, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5073, "tokens_per_second_per_gpu": 17205.02, "total_tokens": 501036296 }, { "epoch": 0.3172043010752688, "grad_norm": 0.8915262818336487, "learning_rate": 2e-05, "loss": 0.6885, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5074, "tokens_per_second_per_gpu": 17416.98, "total_tokens": 501139682 }, { "epoch": 0.31726681670417606, "grad_norm": 0.8771746754646301, "learning_rate": 2e-05, "loss": 0.6436, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5075, "tokens_per_second_per_gpu": 17643.06, "total_tokens": 501239885 }, { "epoch": 0.31732933233308325, "grad_norm": 0.8966233134269714, "learning_rate": 2e-05, "loss": 0.651, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5076, "tokens_per_second_per_gpu": 17238.11, "total_tokens": 501334529 }, { "epoch": 0.3173918479619905, "grad_norm": 0.8786086440086365, "learning_rate": 2e-05, "loss": 0.7045, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5077, "tokens_per_second_per_gpu": 17899.01, "total_tokens": 501434835 }, { "epoch": 0.31745436359089774, "grad_norm": 0.9057759046554565, "learning_rate": 2e-05, "loss": 0.7137, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5078, "tokens_per_second_per_gpu": 16395.04, "total_tokens": 501531514 }, { "epoch": 0.3175168792198049, "grad_norm": 0.8975349068641663, "learning_rate": 2e-05, "loss": 0.6362, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5079, "tokens_per_second_per_gpu": 17023.43, "total_tokens": 501627302 }, { "epoch": 0.31757939484871217, "grad_norm": 0.888303279876709, "learning_rate": 2e-05, "loss": 0.672, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5080, "tokens_per_second_per_gpu": 16912.84, "total_tokens": 501725583 }, { "epoch": 0.3176419104776194, "grad_norm": 0.8623418211936951, "learning_rate": 2e-05, "loss": 0.6749, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5081, "tokens_per_second_per_gpu": 18433.09, "total_tokens": 501828424 }, { "epoch": 0.3177044261065266, "grad_norm": 0.9298983812332153, "learning_rate": 2e-05, "loss": 0.6864, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5082, "tokens_per_second_per_gpu": 17374.22, "total_tokens": 501928478 }, { "epoch": 0.31776694173543385, "grad_norm": 0.8913618922233582, "learning_rate": 2e-05, "loss": 0.634, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5083, "tokens_per_second_per_gpu": 16575.88, "total_tokens": 502023826 }, { "epoch": 0.3178294573643411, "grad_norm": 0.8944481611251831, "learning_rate": 2e-05, "loss": 0.6563, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5084, "tokens_per_second_per_gpu": 16785.0, "total_tokens": 502117469 }, { "epoch": 0.31789197299324834, "grad_norm": 0.8987662196159363, "learning_rate": 2e-05, "loss": 0.6676, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5085, "tokens_per_second_per_gpu": 17035.07, "total_tokens": 502214533 }, { "epoch": 0.31795448862215553, "grad_norm": 0.911911129951477, "learning_rate": 2e-05, "loss": 0.7047, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5086, "tokens_per_second_per_gpu": 17286.62, "total_tokens": 502314195 }, { "epoch": 0.3180170042510628, "grad_norm": 0.8932804465293884, "learning_rate": 2e-05, "loss": 0.6825, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5087, "tokens_per_second_per_gpu": 17785.19, "total_tokens": 502415632 }, { "epoch": 0.31807951987997, "grad_norm": 0.8820117712020874, "learning_rate": 2e-05, "loss": 0.6385, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5088, "tokens_per_second_per_gpu": 17595.74, "total_tokens": 502513296 }, { "epoch": 0.3181420355088772, "grad_norm": 0.9160833954811096, "learning_rate": 2e-05, "loss": 0.6848, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5089, "tokens_per_second_per_gpu": 17677.89, "total_tokens": 502612765 }, { "epoch": 0.31820455113778445, "grad_norm": 0.9146305322647095, "learning_rate": 2e-05, "loss": 0.6904, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5090, "tokens_per_second_per_gpu": 17208.49, "total_tokens": 502709390 }, { "epoch": 0.3182670667666917, "grad_norm": 0.9318079352378845, "learning_rate": 2e-05, "loss": 0.6872, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5091, "tokens_per_second_per_gpu": 17405.31, "total_tokens": 502805668 }, { "epoch": 0.3183295823955989, "grad_norm": 0.9042184352874756, "learning_rate": 2e-05, "loss": 0.6757, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5092, "tokens_per_second_per_gpu": 17708.42, "total_tokens": 502903962 }, { "epoch": 0.31839209802450613, "grad_norm": 0.895799994468689, "learning_rate": 2e-05, "loss": 0.6443, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5093, "tokens_per_second_per_gpu": 16923.77, "total_tokens": 502998998 }, { "epoch": 0.3184546136534134, "grad_norm": 0.914583146572113, "learning_rate": 2e-05, "loss": 0.6616, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5094, "tokens_per_second_per_gpu": 17759.99, "total_tokens": 503096648 }, { "epoch": 0.31851712928232057, "grad_norm": 0.9816442131996155, "learning_rate": 2e-05, "loss": 0.6833, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5095, "tokens_per_second_per_gpu": 16934.53, "total_tokens": 503193809 }, { "epoch": 0.3185796449112278, "grad_norm": 0.8966516256332397, "learning_rate": 2e-05, "loss": 0.6468, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5096, "tokens_per_second_per_gpu": 17399.45, "total_tokens": 503287547 }, { "epoch": 0.31864216054013506, "grad_norm": 0.9106066823005676, "learning_rate": 2e-05, "loss": 0.6681, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5097, "tokens_per_second_per_gpu": 16751.64, "total_tokens": 503384856 }, { "epoch": 0.31870467616904224, "grad_norm": 0.8911319375038147, "learning_rate": 2e-05, "loss": 0.6357, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5098, "tokens_per_second_per_gpu": 17449.62, "total_tokens": 503478971 }, { "epoch": 0.3187671917979495, "grad_norm": 0.933927595615387, "learning_rate": 2e-05, "loss": 0.6844, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5099, "tokens_per_second_per_gpu": 16846.92, "total_tokens": 503577121 }, { "epoch": 0.31882970742685673, "grad_norm": 0.9252758622169495, "learning_rate": 2e-05, "loss": 0.6534, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5100, "tokens_per_second_per_gpu": 16763.81, "total_tokens": 503672975 }, { "epoch": 0.3188922230557639, "grad_norm": 0.9451883435249329, "learning_rate": 2e-05, "loss": 0.67, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5101, "tokens_per_second_per_gpu": 17252.65, "total_tokens": 503771666 }, { "epoch": 0.31895473868467117, "grad_norm": 0.8825212717056274, "learning_rate": 2e-05, "loss": 0.6558, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5102, "tokens_per_second_per_gpu": 17037.01, "total_tokens": 503868382 }, { "epoch": 0.3190172543135784, "grad_norm": 0.9993759393692017, "learning_rate": 2e-05, "loss": 0.6704, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5103, "tokens_per_second_per_gpu": 17102.14, "total_tokens": 503963813 }, { "epoch": 0.3190797699424856, "grad_norm": 0.8743468523025513, "learning_rate": 2e-05, "loss": 0.6783, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5104, "tokens_per_second_per_gpu": 15917.28, "total_tokens": 504058397 }, { "epoch": 0.31914228557139285, "grad_norm": 0.9566866159439087, "learning_rate": 2e-05, "loss": 0.6882, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5105, "tokens_per_second_per_gpu": 16814.13, "total_tokens": 504157257 }, { "epoch": 0.3192048012003001, "grad_norm": 0.9222390651702881, "learning_rate": 2e-05, "loss": 0.6814, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5106, "tokens_per_second_per_gpu": 17217.76, "total_tokens": 504253199 }, { "epoch": 0.3192673168292073, "grad_norm": 0.9041520357131958, "learning_rate": 2e-05, "loss": 0.7337, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5107, "tokens_per_second_per_gpu": 16842.59, "total_tokens": 504355117 }, { "epoch": 0.3193298324581145, "grad_norm": 0.9313990473747253, "learning_rate": 2e-05, "loss": 0.6543, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5108, "tokens_per_second_per_gpu": 17919.03, "total_tokens": 504453869 }, { "epoch": 0.31939234808702177, "grad_norm": 0.9511215090751648, "learning_rate": 2e-05, "loss": 0.6769, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5109, "tokens_per_second_per_gpu": 16231.32, "total_tokens": 504547185 }, { "epoch": 0.31945486371592896, "grad_norm": 0.9238777756690979, "learning_rate": 2e-05, "loss": 0.6181, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5110, "tokens_per_second_per_gpu": 16273.16, "total_tokens": 504639428 }, { "epoch": 0.3195173793448362, "grad_norm": 0.8983558416366577, "learning_rate": 2e-05, "loss": 0.662, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5111, "tokens_per_second_per_gpu": 17964.05, "total_tokens": 504738710 }, { "epoch": 0.31957989497374345, "grad_norm": 0.8986788392066956, "learning_rate": 2e-05, "loss": 0.6344, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5112, "tokens_per_second_per_gpu": 17594.7, "total_tokens": 504836073 }, { "epoch": 0.31964241060265064, "grad_norm": 0.9063138365745544, "learning_rate": 2e-05, "loss": 0.6939, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5113, "tokens_per_second_per_gpu": 17454.67, "total_tokens": 504936210 }, { "epoch": 0.3197049262315579, "grad_norm": 0.9855783581733704, "learning_rate": 2e-05, "loss": 0.643, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5114, "tokens_per_second_per_gpu": 16684.99, "total_tokens": 505030132 }, { "epoch": 0.31976744186046513, "grad_norm": 0.8977969884872437, "learning_rate": 2e-05, "loss": 0.6743, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5115, "tokens_per_second_per_gpu": 17923.52, "total_tokens": 505128712 }, { "epoch": 0.3198299574893723, "grad_norm": 0.9372837543487549, "learning_rate": 2e-05, "loss": 0.6852, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5116, "tokens_per_second_per_gpu": 18380.92, "total_tokens": 505231247 }, { "epoch": 0.31989247311827956, "grad_norm": 0.8846141695976257, "learning_rate": 2e-05, "loss": 0.67, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5117, "tokens_per_second_per_gpu": 17270.75, "total_tokens": 505332665 }, { "epoch": 0.3199549887471868, "grad_norm": 0.917680025100708, "learning_rate": 2e-05, "loss": 0.6752, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5118, "tokens_per_second_per_gpu": 16915.32, "total_tokens": 505429662 }, { "epoch": 0.320017504376094, "grad_norm": 0.9527949690818787, "learning_rate": 2e-05, "loss": 0.6995, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5119, "tokens_per_second_per_gpu": 17396.64, "total_tokens": 505525488 }, { "epoch": 0.32008002000500124, "grad_norm": 0.9296124577522278, "learning_rate": 2e-05, "loss": 0.6742, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5120, "tokens_per_second_per_gpu": 17414.43, "total_tokens": 505626061 }, { "epoch": 0.3201425356339085, "grad_norm": 0.8928413987159729, "learning_rate": 2e-05, "loss": 0.6642, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5121, "tokens_per_second_per_gpu": 16257.61, "total_tokens": 505720827 }, { "epoch": 0.3202050512628157, "grad_norm": 0.8740045428276062, "learning_rate": 2e-05, "loss": 0.6507, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5122, "tokens_per_second_per_gpu": 16895.21, "total_tokens": 505817250 }, { "epoch": 0.3202675668917229, "grad_norm": 0.8903933167457581, "learning_rate": 2e-05, "loss": 0.6917, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5123, "tokens_per_second_per_gpu": 17927.02, "total_tokens": 505917893 }, { "epoch": 0.32033008252063017, "grad_norm": 0.8943025469779968, "learning_rate": 2e-05, "loss": 0.6643, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5124, "tokens_per_second_per_gpu": 17252.78, "total_tokens": 506018759 }, { "epoch": 0.3203925981495374, "grad_norm": 0.9083840250968933, "learning_rate": 2e-05, "loss": 0.6873, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5125, "tokens_per_second_per_gpu": 18059.04, "total_tokens": 506116223 }, { "epoch": 0.3204551137784446, "grad_norm": 0.9197177290916443, "learning_rate": 2e-05, "loss": 0.689, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5126, "tokens_per_second_per_gpu": 16122.61, "total_tokens": 506209159 }, { "epoch": 0.32051762940735185, "grad_norm": 0.9820862412452698, "learning_rate": 2e-05, "loss": 0.6997, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5127, "tokens_per_second_per_gpu": 18072.97, "total_tokens": 506311323 }, { "epoch": 0.3205801450362591, "grad_norm": 0.910175621509552, "learning_rate": 2e-05, "loss": 0.6494, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5128, "tokens_per_second_per_gpu": 17854.0, "total_tokens": 506407156 }, { "epoch": 0.3206426606651663, "grad_norm": 0.9180038571357727, "learning_rate": 2e-05, "loss": 0.6813, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5129, "tokens_per_second_per_gpu": 18020.45, "total_tokens": 506505102 }, { "epoch": 0.3207051762940735, "grad_norm": 0.9178332090377808, "learning_rate": 2e-05, "loss": 0.6937, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5130, "tokens_per_second_per_gpu": 16725.11, "total_tokens": 506602878 }, { "epoch": 0.32076769192298077, "grad_norm": 0.9592701196670532, "learning_rate": 2e-05, "loss": 0.8043, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5131, "tokens_per_second_per_gpu": 18024.06, "total_tokens": 506705390 }, { "epoch": 0.32083020755188796, "grad_norm": 0.8714413046836853, "learning_rate": 2e-05, "loss": 0.65, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5132, "tokens_per_second_per_gpu": 16490.9, "total_tokens": 506801781 }, { "epoch": 0.3208927231807952, "grad_norm": 0.8804338574409485, "learning_rate": 2e-05, "loss": 0.7104, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5133, "tokens_per_second_per_gpu": 17655.65, "total_tokens": 506902266 }, { "epoch": 0.32095523880970245, "grad_norm": 0.9088912010192871, "learning_rate": 2e-05, "loss": 0.6642, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5134, "tokens_per_second_per_gpu": 18582.34, "total_tokens": 507002849 }, { "epoch": 0.32101775443860964, "grad_norm": 0.9066410660743713, "learning_rate": 2e-05, "loss": 0.7099, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5135, "tokens_per_second_per_gpu": 17632.11, "total_tokens": 507103410 }, { "epoch": 0.3210802700675169, "grad_norm": 0.9197719097137451, "learning_rate": 2e-05, "loss": 0.7182, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5136, "tokens_per_second_per_gpu": 15133.38, "total_tokens": 507197290 }, { "epoch": 0.3211427856964241, "grad_norm": 0.9093880653381348, "learning_rate": 2e-05, "loss": 0.6666, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5137, "tokens_per_second_per_gpu": 16859.89, "total_tokens": 507293784 }, { "epoch": 0.3212053013253313, "grad_norm": 0.8929088115692139, "learning_rate": 2e-05, "loss": 0.6662, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5138, "tokens_per_second_per_gpu": 16591.43, "total_tokens": 507388564 }, { "epoch": 0.32126781695423856, "grad_norm": 0.8920208811759949, "learning_rate": 2e-05, "loss": 0.6637, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5139, "tokens_per_second_per_gpu": 17406.22, "total_tokens": 507487986 }, { "epoch": 0.3213303325831458, "grad_norm": 0.9081218838691711, "learning_rate": 2e-05, "loss": 0.6492, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5140, "tokens_per_second_per_gpu": 16696.95, "total_tokens": 507581131 }, { "epoch": 0.321392848212053, "grad_norm": 0.8977354764938354, "learning_rate": 2e-05, "loss": 0.6851, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5141, "tokens_per_second_per_gpu": 18160.03, "total_tokens": 507679599 }, { "epoch": 0.32145536384096024, "grad_norm": 0.8584250211715698, "learning_rate": 2e-05, "loss": 0.6341, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5142, "tokens_per_second_per_gpu": 17410.11, "total_tokens": 507775102 }, { "epoch": 0.3215178794698675, "grad_norm": 0.8840762376785278, "learning_rate": 2e-05, "loss": 0.66, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5143, "tokens_per_second_per_gpu": 16676.77, "total_tokens": 507874131 }, { "epoch": 0.3215803950987747, "grad_norm": 0.9142720103263855, "learning_rate": 2e-05, "loss": 0.6717, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5144, "tokens_per_second_per_gpu": 16362.44, "total_tokens": 507970238 }, { "epoch": 0.3216429107276819, "grad_norm": 0.9435860514640808, "learning_rate": 2e-05, "loss": 0.6951, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5145, "tokens_per_second_per_gpu": 15574.61, "total_tokens": 508059278 }, { "epoch": 0.32170542635658916, "grad_norm": 0.927049994468689, "learning_rate": 2e-05, "loss": 0.6734, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5146, "tokens_per_second_per_gpu": 17194.72, "total_tokens": 508153852 }, { "epoch": 0.32176794198549635, "grad_norm": 0.8971542716026306, "learning_rate": 2e-05, "loss": 0.7132, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5147, "tokens_per_second_per_gpu": 17945.9, "total_tokens": 508255609 }, { "epoch": 0.3218304576144036, "grad_norm": 0.9132170677185059, "learning_rate": 2e-05, "loss": 0.6533, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5148, "tokens_per_second_per_gpu": 17135.68, "total_tokens": 508352473 }, { "epoch": 0.32189297324331084, "grad_norm": 0.9182870388031006, "learning_rate": 2e-05, "loss": 0.7073, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5149, "tokens_per_second_per_gpu": 17205.25, "total_tokens": 508450366 }, { "epoch": 0.32195548887221803, "grad_norm": 0.9384421706199646, "learning_rate": 2e-05, "loss": 0.6842, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5150, "tokens_per_second_per_gpu": 15651.89, "total_tokens": 508545323 }, { "epoch": 0.3220180045011253, "grad_norm": 0.8918080925941467, "learning_rate": 2e-05, "loss": 0.6833, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5151, "tokens_per_second_per_gpu": 16257.73, "total_tokens": 508642644 }, { "epoch": 0.3220805201300325, "grad_norm": 0.9023244976997375, "learning_rate": 2e-05, "loss": 0.6845, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5152, "tokens_per_second_per_gpu": 17751.04, "total_tokens": 508744248 }, { "epoch": 0.3221430357589397, "grad_norm": 0.868828535079956, "learning_rate": 2e-05, "loss": 0.6967, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5153, "tokens_per_second_per_gpu": 17716.63, "total_tokens": 508844542 }, { "epoch": 0.32220555138784696, "grad_norm": 0.881903350353241, "learning_rate": 2e-05, "loss": 0.6616, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5154, "tokens_per_second_per_gpu": 17156.85, "total_tokens": 508940270 }, { "epoch": 0.3222680670167542, "grad_norm": 0.8965604901313782, "learning_rate": 2e-05, "loss": 0.6605, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5155, "tokens_per_second_per_gpu": 16295.5, "total_tokens": 509030931 }, { "epoch": 0.3223305826456614, "grad_norm": 0.8808375000953674, "learning_rate": 2e-05, "loss": 0.6562, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5156, "tokens_per_second_per_gpu": 17340.43, "total_tokens": 509130494 }, { "epoch": 0.32239309827456863, "grad_norm": 0.9312471747398376, "learning_rate": 2e-05, "loss": 0.7055, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5157, "tokens_per_second_per_gpu": 16030.53, "total_tokens": 509225332 }, { "epoch": 0.3224556139034759, "grad_norm": 0.932292640209198, "learning_rate": 2e-05, "loss": 0.6675, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5158, "tokens_per_second_per_gpu": 17795.79, "total_tokens": 509322993 }, { "epoch": 0.32251812953238307, "grad_norm": 0.8958190679550171, "learning_rate": 2e-05, "loss": 0.6514, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5159, "tokens_per_second_per_gpu": 16232.1, "total_tokens": 509419181 }, { "epoch": 0.3225806451612903, "grad_norm": 0.944836437702179, "learning_rate": 2e-05, "loss": 0.6852, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5160, "tokens_per_second_per_gpu": 16079.77, "total_tokens": 509509114 }, { "epoch": 0.32264316079019756, "grad_norm": 0.8826062679290771, "learning_rate": 2e-05, "loss": 0.6806, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5161, "tokens_per_second_per_gpu": 16948.27, "total_tokens": 509609741 }, { "epoch": 0.3227056764191048, "grad_norm": 0.8984041810035706, "learning_rate": 2e-05, "loss": 0.6598, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5162, "tokens_per_second_per_gpu": 17168.62, "total_tokens": 509705422 }, { "epoch": 0.322768192048012, "grad_norm": 0.866014301776886, "learning_rate": 2e-05, "loss": 0.6488, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5163, "tokens_per_second_per_gpu": 17184.48, "total_tokens": 509799592 }, { "epoch": 0.32283070767691924, "grad_norm": 0.8455449938774109, "learning_rate": 2e-05, "loss": 0.6306, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5164, "tokens_per_second_per_gpu": 17363.62, "total_tokens": 509896807 }, { "epoch": 0.3228932233058265, "grad_norm": 0.9040710926055908, "learning_rate": 2e-05, "loss": 0.6978, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5165, "tokens_per_second_per_gpu": 17525.03, "total_tokens": 509995482 }, { "epoch": 0.32295573893473367, "grad_norm": 0.9015071392059326, "learning_rate": 2e-05, "loss": 0.6524, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5166, "tokens_per_second_per_gpu": 17617.03, "total_tokens": 510093955 }, { "epoch": 0.3230182545636409, "grad_norm": 0.881328284740448, "learning_rate": 2e-05, "loss": 0.7111, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5167, "tokens_per_second_per_gpu": 19158.83, "total_tokens": 510195239 }, { "epoch": 0.32308077019254816, "grad_norm": 0.906958281993866, "learning_rate": 2e-05, "loss": 0.7362, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5168, "tokens_per_second_per_gpu": 18055.52, "total_tokens": 510295710 }, { "epoch": 0.32314328582145535, "grad_norm": 0.8968960046768188, "learning_rate": 2e-05, "loss": 0.6727, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5169, "tokens_per_second_per_gpu": 17131.53, "total_tokens": 510393481 }, { "epoch": 0.3232058014503626, "grad_norm": 0.9024876356124878, "learning_rate": 2e-05, "loss": 0.6779, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5170, "tokens_per_second_per_gpu": 17120.9, "total_tokens": 510492292 }, { "epoch": 0.32326831707926984, "grad_norm": 0.8981790542602539, "learning_rate": 2e-05, "loss": 0.6736, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5171, "tokens_per_second_per_gpu": 17059.02, "total_tokens": 510589644 }, { "epoch": 0.32333083270817703, "grad_norm": 0.9029901623725891, "learning_rate": 2e-05, "loss": 0.6445, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5172, "tokens_per_second_per_gpu": 16953.17, "total_tokens": 510681936 }, { "epoch": 0.3233933483370843, "grad_norm": 0.8946239352226257, "learning_rate": 2e-05, "loss": 0.63, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5173, "tokens_per_second_per_gpu": 17271.73, "total_tokens": 510774665 }, { "epoch": 0.3234558639659915, "grad_norm": 0.9497578740119934, "learning_rate": 2e-05, "loss": 0.6771, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5174, "tokens_per_second_per_gpu": 16646.53, "total_tokens": 510867480 }, { "epoch": 0.3235183795948987, "grad_norm": 0.8930511474609375, "learning_rate": 2e-05, "loss": 0.6449, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5175, "tokens_per_second_per_gpu": 16981.89, "total_tokens": 510964892 }, { "epoch": 0.32358089522380595, "grad_norm": 0.9087553024291992, "learning_rate": 2e-05, "loss": 0.6682, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5176, "tokens_per_second_per_gpu": 16271.36, "total_tokens": 511059465 }, { "epoch": 0.3236434108527132, "grad_norm": 0.8833349347114563, "learning_rate": 2e-05, "loss": 0.7058, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5177, "tokens_per_second_per_gpu": 17649.33, "total_tokens": 511157805 }, { "epoch": 0.3237059264816204, "grad_norm": 0.9117575883865356, "learning_rate": 2e-05, "loss": 0.6788, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5178, "tokens_per_second_per_gpu": 17713.1, "total_tokens": 511254849 }, { "epoch": 0.32376844211052763, "grad_norm": 0.9020925760269165, "learning_rate": 2e-05, "loss": 0.6771, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5179, "tokens_per_second_per_gpu": 17520.84, "total_tokens": 511350120 }, { "epoch": 0.3238309577394349, "grad_norm": 0.8849661946296692, "learning_rate": 2e-05, "loss": 0.6445, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5180, "tokens_per_second_per_gpu": 16871.92, "total_tokens": 511444740 }, { "epoch": 0.32389347336834207, "grad_norm": 0.926443338394165, "learning_rate": 2e-05, "loss": 0.6559, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5181, "tokens_per_second_per_gpu": 17156.08, "total_tokens": 511540374 }, { "epoch": 0.3239559889972493, "grad_norm": 0.9519808292388916, "learning_rate": 2e-05, "loss": 0.6537, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5182, "tokens_per_second_per_gpu": 17183.93, "total_tokens": 511634214 }, { "epoch": 0.32401850462615656, "grad_norm": 0.9276576042175293, "learning_rate": 2e-05, "loss": 0.6719, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5183, "tokens_per_second_per_gpu": 15598.04, "total_tokens": 511723618 }, { "epoch": 0.32408102025506375, "grad_norm": 0.9789696931838989, "learning_rate": 2e-05, "loss": 0.734, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5184, "tokens_per_second_per_gpu": 15227.4, "total_tokens": 511816833 }, { "epoch": 0.324143535883971, "grad_norm": 0.8863556981086731, "learning_rate": 2e-05, "loss": 0.729, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5185, "tokens_per_second_per_gpu": 17724.32, "total_tokens": 511917582 }, { "epoch": 0.32420605151287823, "grad_norm": 0.8963993787765503, "learning_rate": 2e-05, "loss": 0.6753, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5186, "tokens_per_second_per_gpu": 17297.99, "total_tokens": 512012652 }, { "epoch": 0.3242685671417854, "grad_norm": 0.8882578015327454, "learning_rate": 2e-05, "loss": 0.6301, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5187, "tokens_per_second_per_gpu": 14919.94, "total_tokens": 512107844 }, { "epoch": 0.32433108277069267, "grad_norm": 0.9550426006317139, "learning_rate": 2e-05, "loss": 0.683, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5188, "tokens_per_second_per_gpu": 17179.12, "total_tokens": 512204698 }, { "epoch": 0.3243935983995999, "grad_norm": 0.9175539612770081, "learning_rate": 2e-05, "loss": 0.7028, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5189, "tokens_per_second_per_gpu": 17201.62, "total_tokens": 512300445 }, { "epoch": 0.3244561140285071, "grad_norm": 0.8781496286392212, "learning_rate": 2e-05, "loss": 0.6184, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5190, "tokens_per_second_per_gpu": 16091.39, "total_tokens": 512395381 }, { "epoch": 0.32451862965741435, "grad_norm": 0.9373664259910583, "learning_rate": 2e-05, "loss": 0.6343, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5191, "tokens_per_second_per_gpu": 17172.37, "total_tokens": 512490013 }, { "epoch": 0.3245811452863216, "grad_norm": 0.905414342880249, "learning_rate": 2e-05, "loss": 0.6779, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5192, "tokens_per_second_per_gpu": 17745.28, "total_tokens": 512590062 }, { "epoch": 0.3246436609152288, "grad_norm": 0.947437047958374, "learning_rate": 2e-05, "loss": 0.7129, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5193, "tokens_per_second_per_gpu": 17639.88, "total_tokens": 512687143 }, { "epoch": 0.324706176544136, "grad_norm": 0.8996438384056091, "learning_rate": 2e-05, "loss": 0.6149, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5194, "tokens_per_second_per_gpu": 17369.71, "total_tokens": 512781484 }, { "epoch": 0.32476869217304327, "grad_norm": 0.895021915435791, "learning_rate": 2e-05, "loss": 0.6702, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5195, "tokens_per_second_per_gpu": 17742.99, "total_tokens": 512879279 }, { "epoch": 0.32483120780195046, "grad_norm": 0.9199827909469604, "learning_rate": 2e-05, "loss": 0.6927, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5196, "tokens_per_second_per_gpu": 17451.47, "total_tokens": 512978729 }, { "epoch": 0.3248937234308577, "grad_norm": 0.908925473690033, "learning_rate": 2e-05, "loss": 0.6956, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5197, "tokens_per_second_per_gpu": 16377.98, "total_tokens": 513076046 }, { "epoch": 0.32495623905976495, "grad_norm": 0.9245375394821167, "learning_rate": 2e-05, "loss": 0.6493, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5198, "tokens_per_second_per_gpu": 16435.81, "total_tokens": 513169535 }, { "epoch": 0.3250187546886722, "grad_norm": 0.9337295889854431, "learning_rate": 2e-05, "loss": 0.6907, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5199, "tokens_per_second_per_gpu": 17051.09, "total_tokens": 513264033 }, { "epoch": 0.3250812703175794, "grad_norm": 0.8938278555870056, "learning_rate": 2e-05, "loss": 0.7158, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5200, "tokens_per_second_per_gpu": 17312.8, "total_tokens": 513363859 }, { "epoch": 0.32514378594648663, "grad_norm": 0.8735404014587402, "learning_rate": 2e-05, "loss": 0.633, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5201, "tokens_per_second_per_gpu": 17134.15, "total_tokens": 513459761 }, { "epoch": 0.3252063015753939, "grad_norm": 0.9228398203849792, "learning_rate": 2e-05, "loss": 0.6883, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5202, "tokens_per_second_per_gpu": 18351.54, "total_tokens": 513559990 }, { "epoch": 0.32526881720430106, "grad_norm": 0.8840941190719604, "learning_rate": 2e-05, "loss": 0.6706, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5203, "tokens_per_second_per_gpu": 17820.14, "total_tokens": 513657873 }, { "epoch": 0.3253313328332083, "grad_norm": 0.882468581199646, "learning_rate": 2e-05, "loss": 0.6533, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5204, "tokens_per_second_per_gpu": 18400.4, "total_tokens": 513756785 }, { "epoch": 0.32539384846211555, "grad_norm": 0.893822193145752, "learning_rate": 2e-05, "loss": 0.6575, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5205, "tokens_per_second_per_gpu": 17447.02, "total_tokens": 513854501 }, { "epoch": 0.32545636409102274, "grad_norm": 0.9210993647575378, "learning_rate": 2e-05, "loss": 0.6396, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5206, "tokens_per_second_per_gpu": 17857.59, "total_tokens": 513950865 }, { "epoch": 0.32551887971993, "grad_norm": 0.9021199345588684, "learning_rate": 2e-05, "loss": 0.6638, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5207, "tokens_per_second_per_gpu": 17825.32, "total_tokens": 514050670 }, { "epoch": 0.32558139534883723, "grad_norm": 0.8886158466339111, "learning_rate": 2e-05, "loss": 0.6451, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5208, "tokens_per_second_per_gpu": 16252.49, "total_tokens": 514148687 }, { "epoch": 0.3256439109777444, "grad_norm": 0.9165380597114563, "learning_rate": 2e-05, "loss": 0.7079, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5209, "tokens_per_second_per_gpu": 16694.3, "total_tokens": 514247494 }, { "epoch": 0.32570642660665167, "grad_norm": 0.8937056660652161, "learning_rate": 2e-05, "loss": 0.6592, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5210, "tokens_per_second_per_gpu": 16350.58, "total_tokens": 514343978 }, { "epoch": 0.3257689422355589, "grad_norm": 0.9030241370201111, "learning_rate": 2e-05, "loss": 0.6649, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5211, "tokens_per_second_per_gpu": 17349.96, "total_tokens": 514441650 }, { "epoch": 0.3258314578644661, "grad_norm": 0.8744344115257263, "learning_rate": 2e-05, "loss": 0.6721, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5212, "tokens_per_second_per_gpu": 18126.68, "total_tokens": 514545832 }, { "epoch": 0.32589397349337335, "grad_norm": 0.9131708741188049, "learning_rate": 2e-05, "loss": 0.6923, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5213, "tokens_per_second_per_gpu": 17404.52, "total_tokens": 514646106 }, { "epoch": 0.3259564891222806, "grad_norm": 0.8932678699493408, "learning_rate": 2e-05, "loss": 0.6434, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5214, "tokens_per_second_per_gpu": 17258.52, "total_tokens": 514742310 }, { "epoch": 0.3260190047511878, "grad_norm": 0.8823164701461792, "learning_rate": 2e-05, "loss": 0.6467, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5215, "tokens_per_second_per_gpu": 17171.86, "total_tokens": 514840179 }, { "epoch": 0.326081520380095, "grad_norm": 0.8938888907432556, "learning_rate": 2e-05, "loss": 0.6564, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5216, "tokens_per_second_per_gpu": 16855.01, "total_tokens": 514934679 }, { "epoch": 0.32614403600900227, "grad_norm": 0.9026588201522827, "learning_rate": 2e-05, "loss": 0.7399, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5217, "tokens_per_second_per_gpu": 17363.57, "total_tokens": 515034036 }, { "epoch": 0.32620655163790946, "grad_norm": 0.9252910017967224, "learning_rate": 2e-05, "loss": 0.6595, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5218, "tokens_per_second_per_gpu": 15575.88, "total_tokens": 515127859 }, { "epoch": 0.3262690672668167, "grad_norm": 0.8872737884521484, "learning_rate": 2e-05, "loss": 0.6748, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5219, "tokens_per_second_per_gpu": 18008.96, "total_tokens": 515226986 }, { "epoch": 0.32633158289572395, "grad_norm": 0.9045718312263489, "learning_rate": 2e-05, "loss": 0.7066, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5220, "tokens_per_second_per_gpu": 18191.13, "total_tokens": 515328931 }, { "epoch": 0.32639409852463114, "grad_norm": 0.9177193641662598, "learning_rate": 2e-05, "loss": 0.6893, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5221, "tokens_per_second_per_gpu": 17447.99, "total_tokens": 515430023 }, { "epoch": 0.3264566141535384, "grad_norm": 0.9025494456291199, "learning_rate": 2e-05, "loss": 0.6648, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5222, "tokens_per_second_per_gpu": 17768.22, "total_tokens": 515529527 }, { "epoch": 0.3265191297824456, "grad_norm": 0.9276450872421265, "learning_rate": 2e-05, "loss": 0.7125, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5223, "tokens_per_second_per_gpu": 18139.26, "total_tokens": 515630851 }, { "epoch": 0.3265816454113528, "grad_norm": 0.8756140470504761, "learning_rate": 2e-05, "loss": 0.6306, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5224, "tokens_per_second_per_gpu": 16542.06, "total_tokens": 515726446 }, { "epoch": 0.32664416104026006, "grad_norm": 0.9022300839424133, "learning_rate": 2e-05, "loss": 0.649, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5225, "tokens_per_second_per_gpu": 16587.64, "total_tokens": 515820679 }, { "epoch": 0.3267066766691673, "grad_norm": 0.9095174074172974, "learning_rate": 2e-05, "loss": 0.6905, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5226, "tokens_per_second_per_gpu": 17660.53, "total_tokens": 515919511 }, { "epoch": 0.3267691922980745, "grad_norm": 0.9125518202781677, "learning_rate": 2e-05, "loss": 0.6942, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5227, "tokens_per_second_per_gpu": 17065.95, "total_tokens": 516015939 }, { "epoch": 0.32683170792698174, "grad_norm": 0.9234411716461182, "learning_rate": 2e-05, "loss": 0.6807, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5228, "tokens_per_second_per_gpu": 17901.82, "total_tokens": 516115742 }, { "epoch": 0.326894223555889, "grad_norm": 0.9061856865882874, "learning_rate": 2e-05, "loss": 0.6739, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5229, "tokens_per_second_per_gpu": 18037.5, "total_tokens": 516215631 }, { "epoch": 0.3269567391847962, "grad_norm": 0.8933022618293762, "learning_rate": 2e-05, "loss": 0.6983, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5230, "tokens_per_second_per_gpu": 17407.01, "total_tokens": 516315144 }, { "epoch": 0.3270192548137034, "grad_norm": 0.8515549302101135, "learning_rate": 2e-05, "loss": 0.6632, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5231, "tokens_per_second_per_gpu": 16734.55, "total_tokens": 516413765 }, { "epoch": 0.32708177044261066, "grad_norm": 0.8416997194290161, "learning_rate": 2e-05, "loss": 0.6832, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5232, "tokens_per_second_per_gpu": 17630.93, "total_tokens": 516517098 }, { "epoch": 0.32714428607151785, "grad_norm": 0.9023521542549133, "learning_rate": 2e-05, "loss": 0.6825, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5233, "tokens_per_second_per_gpu": 18527.09, "total_tokens": 516619091 }, { "epoch": 0.3272068017004251, "grad_norm": 0.9178140759468079, "learning_rate": 2e-05, "loss": 0.669, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5234, "tokens_per_second_per_gpu": 16102.99, "total_tokens": 516710703 }, { "epoch": 0.32726931732933234, "grad_norm": 0.9267029762268066, "learning_rate": 2e-05, "loss": 0.6973, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5235, "tokens_per_second_per_gpu": 16696.69, "total_tokens": 516805338 }, { "epoch": 0.32733183295823953, "grad_norm": 0.9096778035163879, "learning_rate": 2e-05, "loss": 0.6877, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5236, "tokens_per_second_per_gpu": 16665.91, "total_tokens": 516903809 }, { "epoch": 0.3273943485871468, "grad_norm": 0.9017075300216675, "learning_rate": 2e-05, "loss": 0.6748, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5237, "tokens_per_second_per_gpu": 17028.3, "total_tokens": 517001180 }, { "epoch": 0.327456864216054, "grad_norm": 0.8876491189002991, "learning_rate": 2e-05, "loss": 0.7061, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5238, "tokens_per_second_per_gpu": 18110.62, "total_tokens": 517102840 }, { "epoch": 0.32751937984496127, "grad_norm": 0.9129296541213989, "learning_rate": 2e-05, "loss": 0.6855, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5239, "tokens_per_second_per_gpu": 18742.92, "total_tokens": 517207239 }, { "epoch": 0.32758189547386846, "grad_norm": 0.9092218279838562, "learning_rate": 2e-05, "loss": 0.6338, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5240, "tokens_per_second_per_gpu": 17597.48, "total_tokens": 517303253 }, { "epoch": 0.3276444111027757, "grad_norm": 0.8898172974586487, "learning_rate": 2e-05, "loss": 0.628, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5241, "tokens_per_second_per_gpu": 16677.39, "total_tokens": 517398952 }, { "epoch": 0.32770692673168295, "grad_norm": 0.906278133392334, "learning_rate": 2e-05, "loss": 0.6772, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5242, "tokens_per_second_per_gpu": 17843.21, "total_tokens": 517497872 }, { "epoch": 0.32776944236059014, "grad_norm": 0.9028851985931396, "learning_rate": 2e-05, "loss": 0.6896, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5243, "tokens_per_second_per_gpu": 17366.15, "total_tokens": 517599125 }, { "epoch": 0.3278319579894974, "grad_norm": 0.9298558235168457, "learning_rate": 2e-05, "loss": 0.7081, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5244, "tokens_per_second_per_gpu": 18747.64, "total_tokens": 517702086 }, { "epoch": 0.3278944736184046, "grad_norm": 0.8735672831535339, "learning_rate": 2e-05, "loss": 0.6437, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5245, "tokens_per_second_per_gpu": 18513.98, "total_tokens": 517802197 }, { "epoch": 0.3279569892473118, "grad_norm": 0.8566532135009766, "learning_rate": 2e-05, "loss": 0.6388, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5246, "tokens_per_second_per_gpu": 17629.45, "total_tokens": 517901439 }, { "epoch": 0.32801950487621906, "grad_norm": 0.8893297910690308, "learning_rate": 2e-05, "loss": 0.6727, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5247, "tokens_per_second_per_gpu": 17209.19, "total_tokens": 518002138 }, { "epoch": 0.3280820205051263, "grad_norm": 0.8738352060317993, "learning_rate": 2e-05, "loss": 0.7237, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5248, "tokens_per_second_per_gpu": 18103.52, "total_tokens": 518106751 }, { "epoch": 0.3281445361340335, "grad_norm": 0.8627437353134155, "learning_rate": 2e-05, "loss": 0.6494, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5249, "tokens_per_second_per_gpu": 17629.16, "total_tokens": 518204464 }, { "epoch": 0.32820705176294074, "grad_norm": 0.885006844997406, "learning_rate": 2e-05, "loss": 0.6736, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5250, "tokens_per_second_per_gpu": 17236.17, "total_tokens": 518304070 }, { "epoch": 0.328269567391848, "grad_norm": 0.9381754398345947, "learning_rate": 2e-05, "loss": 0.6937, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5251, "tokens_per_second_per_gpu": 18070.88, "total_tokens": 518402044 }, { "epoch": 0.32833208302075517, "grad_norm": 0.8746910095214844, "learning_rate": 2e-05, "loss": 0.6422, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5252, "tokens_per_second_per_gpu": 17168.7, "total_tokens": 518497077 }, { "epoch": 0.3283945986496624, "grad_norm": 0.9118909239768982, "learning_rate": 2e-05, "loss": 0.6467, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5253, "tokens_per_second_per_gpu": 17288.46, "total_tokens": 518593219 }, { "epoch": 0.32845711427856966, "grad_norm": 0.8943979144096375, "learning_rate": 2e-05, "loss": 0.6556, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5254, "tokens_per_second_per_gpu": 18412.52, "total_tokens": 518694000 }, { "epoch": 0.32851962990747685, "grad_norm": 0.9311215281486511, "learning_rate": 2e-05, "loss": 0.6857, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5255, "tokens_per_second_per_gpu": 18917.09, "total_tokens": 518795901 }, { "epoch": 0.3285821455363841, "grad_norm": 0.8747282028198242, "learning_rate": 2e-05, "loss": 0.6552, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5256, "tokens_per_second_per_gpu": 17331.5, "total_tokens": 518897795 }, { "epoch": 0.32864466116529134, "grad_norm": 0.8731852769851685, "learning_rate": 2e-05, "loss": 0.6746, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5257, "tokens_per_second_per_gpu": 18728.38, "total_tokens": 518998879 }, { "epoch": 0.32870717679419853, "grad_norm": 0.8607547879219055, "learning_rate": 2e-05, "loss": 0.6347, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5258, "tokens_per_second_per_gpu": 18241.02, "total_tokens": 519101100 }, { "epoch": 0.3287696924231058, "grad_norm": 0.8915632963180542, "learning_rate": 2e-05, "loss": 0.6346, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5259, "tokens_per_second_per_gpu": 16311.55, "total_tokens": 519197351 }, { "epoch": 0.328832208052013, "grad_norm": 0.9724758267402649, "learning_rate": 2e-05, "loss": 0.7036, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5260, "tokens_per_second_per_gpu": 17623.34, "total_tokens": 519295993 }, { "epoch": 0.3288947236809202, "grad_norm": 0.8891416788101196, "learning_rate": 2e-05, "loss": 0.6791, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5261, "tokens_per_second_per_gpu": 16518.92, "total_tokens": 519392102 }, { "epoch": 0.32895723930982745, "grad_norm": 0.8753184676170349, "learning_rate": 2e-05, "loss": 0.6363, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5262, "tokens_per_second_per_gpu": 17127.81, "total_tokens": 519491882 }, { "epoch": 0.3290197549387347, "grad_norm": 0.8644348382949829, "learning_rate": 2e-05, "loss": 0.6389, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5263, "tokens_per_second_per_gpu": 17599.45, "total_tokens": 519591019 }, { "epoch": 0.3290822705676419, "grad_norm": 0.9169090390205383, "learning_rate": 2e-05, "loss": 0.6591, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5264, "tokens_per_second_per_gpu": 17128.13, "total_tokens": 519688294 }, { "epoch": 0.32914478619654913, "grad_norm": 0.9073196053504944, "learning_rate": 2e-05, "loss": 0.6956, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5265, "tokens_per_second_per_gpu": 17826.52, "total_tokens": 519788893 }, { "epoch": 0.3292073018254564, "grad_norm": 0.8710669875144958, "learning_rate": 2e-05, "loss": 0.6293, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5266, "tokens_per_second_per_gpu": 17968.17, "total_tokens": 519886275 }, { "epoch": 0.32926981745436357, "grad_norm": 0.8943158388137817, "learning_rate": 2e-05, "loss": 0.6795, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5267, "tokens_per_second_per_gpu": 17874.5, "total_tokens": 519987007 }, { "epoch": 0.3293323330832708, "grad_norm": 0.8702406883239746, "learning_rate": 2e-05, "loss": 0.6655, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5268, "tokens_per_second_per_gpu": 17487.15, "total_tokens": 520088546 }, { "epoch": 0.32939484871217806, "grad_norm": 0.8922988772392273, "learning_rate": 2e-05, "loss": 0.6667, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5269, "tokens_per_second_per_gpu": 17078.43, "total_tokens": 520183790 }, { "epoch": 0.32945736434108525, "grad_norm": 0.9072796702384949, "learning_rate": 2e-05, "loss": 0.6727, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5270, "tokens_per_second_per_gpu": 17663.17, "total_tokens": 520283939 }, { "epoch": 0.3295198799699925, "grad_norm": 0.9280468225479126, "learning_rate": 2e-05, "loss": 0.6777, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5271, "tokens_per_second_per_gpu": 18613.52, "total_tokens": 520386177 }, { "epoch": 0.32958239559889974, "grad_norm": 0.8861817121505737, "learning_rate": 2e-05, "loss": 0.6476, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5272, "tokens_per_second_per_gpu": 17771.29, "total_tokens": 520485637 }, { "epoch": 0.3296449112278069, "grad_norm": 0.9143186807632446, "learning_rate": 2e-05, "loss": 0.6281, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5273, "tokens_per_second_per_gpu": 17934.23, "total_tokens": 520582608 }, { "epoch": 0.32970742685671417, "grad_norm": 0.8975763320922852, "learning_rate": 2e-05, "loss": 0.6593, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5274, "tokens_per_second_per_gpu": 16954.86, "total_tokens": 520681352 }, { "epoch": 0.3297699424856214, "grad_norm": 0.8706844449043274, "learning_rate": 2e-05, "loss": 0.635, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5275, "tokens_per_second_per_gpu": 17059.95, "total_tokens": 520779220 }, { "epoch": 0.32983245811452866, "grad_norm": 0.9061569571495056, "learning_rate": 2e-05, "loss": 0.65, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5276, "tokens_per_second_per_gpu": 17613.43, "total_tokens": 520876308 }, { "epoch": 0.32989497374343585, "grad_norm": 0.8945186734199524, "learning_rate": 2e-05, "loss": 0.6662, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5277, "tokens_per_second_per_gpu": 16247.18, "total_tokens": 520974480 }, { "epoch": 0.3299574893723431, "grad_norm": 0.8578171730041504, "learning_rate": 2e-05, "loss": 0.6601, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5278, "tokens_per_second_per_gpu": 16590.4, "total_tokens": 521071930 }, { "epoch": 0.33002000500125034, "grad_norm": 0.8886570334434509, "learning_rate": 2e-05, "loss": 0.618, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5279, "tokens_per_second_per_gpu": 16527.81, "total_tokens": 521165603 }, { "epoch": 0.3300825206301575, "grad_norm": 0.8589403629302979, "learning_rate": 2e-05, "loss": 0.7144, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5280, "tokens_per_second_per_gpu": 17429.4, "total_tokens": 521266323 }, { "epoch": 0.3301450362590648, "grad_norm": 0.8903125524520874, "learning_rate": 2e-05, "loss": 0.6548, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5281, "tokens_per_second_per_gpu": 17816.24, "total_tokens": 521362105 }, { "epoch": 0.330207551887972, "grad_norm": 0.9027239084243774, "learning_rate": 2e-05, "loss": 0.6476, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5282, "tokens_per_second_per_gpu": 17017.91, "total_tokens": 521460039 }, { "epoch": 0.3302700675168792, "grad_norm": 0.8975613117218018, "learning_rate": 2e-05, "loss": 0.6752, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5283, "tokens_per_second_per_gpu": 17139.66, "total_tokens": 521560534 }, { "epoch": 0.33033258314578645, "grad_norm": 0.8879722952842712, "learning_rate": 2e-05, "loss": 0.6882, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5284, "tokens_per_second_per_gpu": 17241.11, "total_tokens": 521661314 }, { "epoch": 0.3303950987746937, "grad_norm": 0.9068454504013062, "learning_rate": 2e-05, "loss": 0.6575, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5285, "tokens_per_second_per_gpu": 17870.81, "total_tokens": 521758985 }, { "epoch": 0.3304576144036009, "grad_norm": 0.8893747329711914, "learning_rate": 2e-05, "loss": 0.6411, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5286, "tokens_per_second_per_gpu": 16358.37, "total_tokens": 521858088 }, { "epoch": 0.33052013003250813, "grad_norm": 0.9011427164077759, "learning_rate": 2e-05, "loss": 0.6505, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5287, "tokens_per_second_per_gpu": 17153.66, "total_tokens": 521955574 }, { "epoch": 0.3305826456614154, "grad_norm": 0.8819950819015503, "learning_rate": 2e-05, "loss": 0.6793, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5288, "tokens_per_second_per_gpu": 18254.72, "total_tokens": 522054556 }, { "epoch": 0.33064516129032256, "grad_norm": 0.9034757614135742, "learning_rate": 2e-05, "loss": 0.6652, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5289, "tokens_per_second_per_gpu": 16541.01, "total_tokens": 522150654 }, { "epoch": 0.3307076769192298, "grad_norm": 0.9026777744293213, "learning_rate": 2e-05, "loss": 0.6206, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5290, "tokens_per_second_per_gpu": 15463.4, "total_tokens": 522244986 }, { "epoch": 0.33077019254813705, "grad_norm": 0.9072816371917725, "learning_rate": 2e-05, "loss": 0.6912, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5291, "tokens_per_second_per_gpu": 17921.66, "total_tokens": 522346337 }, { "epoch": 0.33083270817704424, "grad_norm": 0.8850441575050354, "learning_rate": 2e-05, "loss": 0.648, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5292, "tokens_per_second_per_gpu": 16913.57, "total_tokens": 522443719 }, { "epoch": 0.3308952238059515, "grad_norm": 0.9081660509109497, "learning_rate": 2e-05, "loss": 0.6693, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5293, "tokens_per_second_per_gpu": 16685.17, "total_tokens": 522541147 }, { "epoch": 0.33095773943485873, "grad_norm": 0.8686012625694275, "learning_rate": 2e-05, "loss": 0.6345, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5294, "tokens_per_second_per_gpu": 17142.68, "total_tokens": 522640052 }, { "epoch": 0.3310202550637659, "grad_norm": 0.9265127778053284, "learning_rate": 2e-05, "loss": 0.6697, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5295, "tokens_per_second_per_gpu": 17088.36, "total_tokens": 522740238 }, { "epoch": 0.33108277069267317, "grad_norm": 0.9101846218109131, "learning_rate": 2e-05, "loss": 0.6541, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5296, "tokens_per_second_per_gpu": 16515.88, "total_tokens": 522838389 }, { "epoch": 0.3311452863215804, "grad_norm": 0.8912482261657715, "learning_rate": 2e-05, "loss": 0.6436, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5297, "tokens_per_second_per_gpu": 17350.66, "total_tokens": 522935941 }, { "epoch": 0.3312078019504876, "grad_norm": 0.9096218943595886, "learning_rate": 2e-05, "loss": 0.6786, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5298, "tokens_per_second_per_gpu": 16759.12, "total_tokens": 523031974 }, { "epoch": 0.33127031757939485, "grad_norm": 0.8898118734359741, "learning_rate": 2e-05, "loss": 0.646, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5299, "tokens_per_second_per_gpu": 17846.79, "total_tokens": 523135803 }, { "epoch": 0.3313328332083021, "grad_norm": 0.9355776309967041, "learning_rate": 2e-05, "loss": 0.6725, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5300, "tokens_per_second_per_gpu": 17806.15, "total_tokens": 523235973 }, { "epoch": 0.3313953488372093, "grad_norm": 0.9077954888343811, "learning_rate": 2e-05, "loss": 0.6684, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5301, "tokens_per_second_per_gpu": 16804.94, "total_tokens": 523331141 }, { "epoch": 0.3314578644661165, "grad_norm": 0.8693501949310303, "learning_rate": 2e-05, "loss": 0.6782, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5302, "tokens_per_second_per_gpu": 17623.05, "total_tokens": 523429160 }, { "epoch": 0.33152038009502377, "grad_norm": 0.873828113079071, "learning_rate": 2e-05, "loss": 0.6773, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5303, "tokens_per_second_per_gpu": 18331.75, "total_tokens": 523531362 }, { "epoch": 0.33158289572393096, "grad_norm": 0.9718504548072815, "learning_rate": 2e-05, "loss": 0.6318, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5304, "tokens_per_second_per_gpu": 16765.75, "total_tokens": 523625333 }, { "epoch": 0.3316454113528382, "grad_norm": 0.9022431969642639, "learning_rate": 2e-05, "loss": 0.6787, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5305, "tokens_per_second_per_gpu": 17764.42, "total_tokens": 523723442 }, { "epoch": 0.33170792698174545, "grad_norm": 0.884693443775177, "learning_rate": 2e-05, "loss": 0.6266, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5306, "tokens_per_second_per_gpu": 16821.84, "total_tokens": 523820395 }, { "epoch": 0.33177044261065264, "grad_norm": 0.9232561588287354, "learning_rate": 2e-05, "loss": 0.6995, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5307, "tokens_per_second_per_gpu": 17788.85, "total_tokens": 523918425 }, { "epoch": 0.3318329582395599, "grad_norm": 0.8554764986038208, "learning_rate": 2e-05, "loss": 0.6135, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5308, "tokens_per_second_per_gpu": 17718.37, "total_tokens": 524020807 }, { "epoch": 0.33189547386846713, "grad_norm": 0.8979662656784058, "learning_rate": 2e-05, "loss": 0.6442, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5309, "tokens_per_second_per_gpu": 17041.98, "total_tokens": 524116871 }, { "epoch": 0.3319579894973743, "grad_norm": 0.92364901304245, "learning_rate": 2e-05, "loss": 0.6673, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5310, "tokens_per_second_per_gpu": 16547.01, "total_tokens": 524213469 }, { "epoch": 0.33202050512628156, "grad_norm": 0.9126404523849487, "learning_rate": 2e-05, "loss": 0.646, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5311, "tokens_per_second_per_gpu": 16105.3, "total_tokens": 524310018 }, { "epoch": 0.3320830207551888, "grad_norm": 0.9158374071121216, "learning_rate": 2e-05, "loss": 0.6798, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5312, "tokens_per_second_per_gpu": 17574.13, "total_tokens": 524409518 }, { "epoch": 0.33214553638409605, "grad_norm": 0.9152207374572754, "learning_rate": 2e-05, "loss": 0.6952, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5313, "tokens_per_second_per_gpu": 17016.48, "total_tokens": 524508902 }, { "epoch": 0.33220805201300324, "grad_norm": 0.9187110066413879, "learning_rate": 2e-05, "loss": 0.6953, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5314, "tokens_per_second_per_gpu": 16856.76, "total_tokens": 524603421 }, { "epoch": 0.3322705676419105, "grad_norm": 0.8984391689300537, "learning_rate": 2e-05, "loss": 0.6896, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5315, "tokens_per_second_per_gpu": 16979.59, "total_tokens": 524701664 }, { "epoch": 0.33233308327081773, "grad_norm": 0.8790533542633057, "learning_rate": 2e-05, "loss": 0.6661, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5316, "tokens_per_second_per_gpu": 17801.66, "total_tokens": 524803878 }, { "epoch": 0.3323955988997249, "grad_norm": 0.8801034092903137, "learning_rate": 2e-05, "loss": 0.6396, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5317, "tokens_per_second_per_gpu": 17023.05, "total_tokens": 524901353 }, { "epoch": 0.33245811452863216, "grad_norm": 0.9014673233032227, "learning_rate": 2e-05, "loss": 0.6873, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5318, "tokens_per_second_per_gpu": 18373.55, "total_tokens": 525003304 }, { "epoch": 0.3325206301575394, "grad_norm": 0.8972218036651611, "learning_rate": 2e-05, "loss": 0.7119, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5319, "tokens_per_second_per_gpu": 18240.28, "total_tokens": 525106165 }, { "epoch": 0.3325831457864466, "grad_norm": 0.8909901976585388, "learning_rate": 2e-05, "loss": 0.7017, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5320, "tokens_per_second_per_gpu": 18949.73, "total_tokens": 525213830 }, { "epoch": 0.33264566141535384, "grad_norm": 0.8593877553939819, "learning_rate": 2e-05, "loss": 0.66, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5321, "tokens_per_second_per_gpu": 19249.63, "total_tokens": 525320265 }, { "epoch": 0.3327081770442611, "grad_norm": 0.9238847494125366, "learning_rate": 2e-05, "loss": 0.7132, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5322, "tokens_per_second_per_gpu": 17909.17, "total_tokens": 525418595 }, { "epoch": 0.3327706926731683, "grad_norm": 0.9652770161628723, "learning_rate": 2e-05, "loss": 0.6453, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5323, "tokens_per_second_per_gpu": 16343.73, "total_tokens": 525516199 }, { "epoch": 0.3328332083020755, "grad_norm": 0.9385060667991638, "learning_rate": 2e-05, "loss": 0.6457, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5324, "tokens_per_second_per_gpu": 16660.68, "total_tokens": 525610672 }, { "epoch": 0.33289572393098277, "grad_norm": 0.8892576694488525, "learning_rate": 2e-05, "loss": 0.6862, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5325, "tokens_per_second_per_gpu": 17281.54, "total_tokens": 525709707 }, { "epoch": 0.33295823955988996, "grad_norm": 0.9023972153663635, "learning_rate": 2e-05, "loss": 0.6621, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5326, "tokens_per_second_per_gpu": 17208.89, "total_tokens": 525808556 }, { "epoch": 0.3330207551887972, "grad_norm": 0.877837061882019, "learning_rate": 2e-05, "loss": 0.654, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5327, "tokens_per_second_per_gpu": 18169.56, "total_tokens": 525910039 }, { "epoch": 0.33308327081770445, "grad_norm": 0.9000400900840759, "learning_rate": 2e-05, "loss": 0.7195, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5328, "tokens_per_second_per_gpu": 17035.4, "total_tokens": 526010006 }, { "epoch": 0.33314578644661164, "grad_norm": 0.9192623496055603, "learning_rate": 2e-05, "loss": 0.6951, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5329, "tokens_per_second_per_gpu": 16849.1, "total_tokens": 526105178 }, { "epoch": 0.3332083020755189, "grad_norm": 0.9157998561859131, "learning_rate": 2e-05, "loss": 0.6154, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5330, "tokens_per_second_per_gpu": 16135.42, "total_tokens": 526196931 }, { "epoch": 0.3332708177044261, "grad_norm": 0.8763741254806519, "learning_rate": 2e-05, "loss": 0.6798, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5331, "tokens_per_second_per_gpu": 18453.51, "total_tokens": 526300494 }, { "epoch": 0.3333333333333333, "grad_norm": 0.8698432445526123, "learning_rate": 2e-05, "loss": 0.6606, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5332, "tokens_per_second_per_gpu": 18076.58, "total_tokens": 526403253 }, { "epoch": 0.33339584896224056, "grad_norm": 0.9085890054702759, "learning_rate": 2e-05, "loss": 0.6725, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5333, "tokens_per_second_per_gpu": 17707.29, "total_tokens": 526502797 }, { "epoch": 0.3334583645911478, "grad_norm": 0.9113527536392212, "learning_rate": 2e-05, "loss": 0.732, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5334, "tokens_per_second_per_gpu": 17747.41, "total_tokens": 526605575 }, { "epoch": 0.333520880220055, "grad_norm": 0.8894237875938416, "learning_rate": 2e-05, "loss": 0.6231, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5335, "tokens_per_second_per_gpu": 16481.12, "total_tokens": 526701669 }, { "epoch": 0.33358339584896224, "grad_norm": 0.9163236021995544, "learning_rate": 2e-05, "loss": 0.6726, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5336, "tokens_per_second_per_gpu": 17600.76, "total_tokens": 526799006 }, { "epoch": 0.3336459114778695, "grad_norm": 0.9225397706031799, "learning_rate": 2e-05, "loss": 0.6658, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5337, "tokens_per_second_per_gpu": 15715.95, "total_tokens": 526893623 }, { "epoch": 0.3337084271067767, "grad_norm": 0.9167051911354065, "learning_rate": 2e-05, "loss": 0.657, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5338, "tokens_per_second_per_gpu": 16868.68, "total_tokens": 526993517 }, { "epoch": 0.3337709427356839, "grad_norm": 0.9239954948425293, "learning_rate": 2e-05, "loss": 0.6697, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5339, "tokens_per_second_per_gpu": 16614.19, "total_tokens": 527087545 }, { "epoch": 0.33383345836459116, "grad_norm": 0.8954859375953674, "learning_rate": 2e-05, "loss": 0.6923, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5340, "tokens_per_second_per_gpu": 17064.29, "total_tokens": 527185926 }, { "epoch": 0.33389597399349835, "grad_norm": 0.9172598719596863, "learning_rate": 2e-05, "loss": 0.6728, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5341, "tokens_per_second_per_gpu": 16205.84, "total_tokens": 527283181 }, { "epoch": 0.3339584896224056, "grad_norm": 0.9422310590744019, "learning_rate": 2e-05, "loss": 0.7143, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5342, "tokens_per_second_per_gpu": 17560.4, "total_tokens": 527381696 }, { "epoch": 0.33402100525131284, "grad_norm": 0.8834006190299988, "learning_rate": 2e-05, "loss": 0.6501, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5343, "tokens_per_second_per_gpu": 17469.38, "total_tokens": 527482431 }, { "epoch": 0.33408352088022003, "grad_norm": 0.9106248617172241, "learning_rate": 2e-05, "loss": 0.6571, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5344, "tokens_per_second_per_gpu": 17396.61, "total_tokens": 527582006 }, { "epoch": 0.3341460365091273, "grad_norm": 0.9005135893821716, "learning_rate": 2e-05, "loss": 0.6605, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5345, "tokens_per_second_per_gpu": 17279.64, "total_tokens": 527682556 }, { "epoch": 0.3342085521380345, "grad_norm": 0.8882275223731995, "learning_rate": 2e-05, "loss": 0.6429, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5346, "tokens_per_second_per_gpu": 17345.86, "total_tokens": 527782804 }, { "epoch": 0.3342710677669417, "grad_norm": 0.9400073289871216, "learning_rate": 2e-05, "loss": 0.6738, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5347, "tokens_per_second_per_gpu": 18461.49, "total_tokens": 527884831 }, { "epoch": 0.33433358339584895, "grad_norm": 0.845649242401123, "learning_rate": 2e-05, "loss": 0.6492, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5348, "tokens_per_second_per_gpu": 17812.96, "total_tokens": 527985053 }, { "epoch": 0.3343960990247562, "grad_norm": 0.9117016792297363, "learning_rate": 2e-05, "loss": 0.6418, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5349, "tokens_per_second_per_gpu": 17020.05, "total_tokens": 528081463 }, { "epoch": 0.3344586146536634, "grad_norm": 0.8977041244506836, "learning_rate": 2e-05, "loss": 0.6694, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5350, "tokens_per_second_per_gpu": 16704.22, "total_tokens": 528179059 }, { "epoch": 0.33452113028257063, "grad_norm": 0.912532389163971, "learning_rate": 2e-05, "loss": 0.661, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5351, "tokens_per_second_per_gpu": 18491.07, "total_tokens": 528283871 }, { "epoch": 0.3345836459114779, "grad_norm": 0.884583592414856, "learning_rate": 2e-05, "loss": 0.6218, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5352, "tokens_per_second_per_gpu": 17072.41, "total_tokens": 528381943 }, { "epoch": 0.3346461615403851, "grad_norm": 0.8993296027183533, "learning_rate": 2e-05, "loss": 0.7, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5353, "tokens_per_second_per_gpu": 17921.94, "total_tokens": 528482460 }, { "epoch": 0.3347086771692923, "grad_norm": 0.9160593152046204, "learning_rate": 2e-05, "loss": 0.7065, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5354, "tokens_per_second_per_gpu": 17683.37, "total_tokens": 528581972 }, { "epoch": 0.33477119279819956, "grad_norm": 0.8758966326713562, "learning_rate": 2e-05, "loss": 0.6688, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5355, "tokens_per_second_per_gpu": 18438.69, "total_tokens": 528682702 }, { "epoch": 0.3348337084271068, "grad_norm": 0.9164385795593262, "learning_rate": 2e-05, "loss": 0.6766, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5356, "tokens_per_second_per_gpu": 17842.39, "total_tokens": 528783882 }, { "epoch": 0.334896224056014, "grad_norm": 0.8982093930244446, "learning_rate": 2e-05, "loss": 0.6661, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5357, "tokens_per_second_per_gpu": 17069.45, "total_tokens": 528880983 }, { "epoch": 0.33495873968492124, "grad_norm": 0.8884450793266296, "learning_rate": 2e-05, "loss": 0.6844, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5358, "tokens_per_second_per_gpu": 18203.28, "total_tokens": 528984710 }, { "epoch": 0.3350212553138285, "grad_norm": 0.8935660123825073, "learning_rate": 2e-05, "loss": 0.6149, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5359, "tokens_per_second_per_gpu": 16305.08, "total_tokens": 529081886 }, { "epoch": 0.33508377094273567, "grad_norm": 0.9167789220809937, "learning_rate": 2e-05, "loss": 0.7517, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5360, "tokens_per_second_per_gpu": 17510.72, "total_tokens": 529185544 }, { "epoch": 0.3351462865716429, "grad_norm": 0.8641623854637146, "learning_rate": 2e-05, "loss": 0.6623, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5361, "tokens_per_second_per_gpu": 17391.83, "total_tokens": 529285479 }, { "epoch": 0.33520880220055016, "grad_norm": 0.8876986503601074, "learning_rate": 2e-05, "loss": 0.6792, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5362, "tokens_per_second_per_gpu": 17279.61, "total_tokens": 529383743 }, { "epoch": 0.33527131782945735, "grad_norm": 0.9252294301986694, "learning_rate": 2e-05, "loss": 0.6783, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5363, "tokens_per_second_per_gpu": 17666.14, "total_tokens": 529481236 }, { "epoch": 0.3353338334583646, "grad_norm": 0.9478707313537598, "learning_rate": 2e-05, "loss": 0.7029, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5364, "tokens_per_second_per_gpu": 18169.55, "total_tokens": 529586298 }, { "epoch": 0.33539634908727184, "grad_norm": 0.8921568393707275, "learning_rate": 2e-05, "loss": 0.7055, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5365, "tokens_per_second_per_gpu": 17789.43, "total_tokens": 529688786 }, { "epoch": 0.33545886471617903, "grad_norm": 0.9119784832000732, "learning_rate": 2e-05, "loss": 0.6774, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5366, "tokens_per_second_per_gpu": 17535.28, "total_tokens": 529788678 }, { "epoch": 0.3355213803450863, "grad_norm": 0.9355722069740295, "learning_rate": 2e-05, "loss": 0.6419, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5367, "tokens_per_second_per_gpu": 17584.07, "total_tokens": 529888010 }, { "epoch": 0.3355838959739935, "grad_norm": 0.9120594263076782, "learning_rate": 2e-05, "loss": 0.6242, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5368, "tokens_per_second_per_gpu": 18014.69, "total_tokens": 529988964 }, { "epoch": 0.3356464116029007, "grad_norm": 0.911699652671814, "learning_rate": 2e-05, "loss": 0.6529, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5369, "tokens_per_second_per_gpu": 17609.82, "total_tokens": 530086967 }, { "epoch": 0.33570892723180795, "grad_norm": 0.8875431418418884, "learning_rate": 2e-05, "loss": 0.6503, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5370, "tokens_per_second_per_gpu": 17339.0, "total_tokens": 530185676 }, { "epoch": 0.3357714428607152, "grad_norm": 0.9255253672599792, "learning_rate": 2e-05, "loss": 0.649, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5371, "tokens_per_second_per_gpu": 16873.23, "total_tokens": 530282571 }, { "epoch": 0.3358339584896224, "grad_norm": 0.9317464232444763, "learning_rate": 2e-05, "loss": 0.7058, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5372, "tokens_per_second_per_gpu": 16895.7, "total_tokens": 530383142 }, { "epoch": 0.33589647411852963, "grad_norm": 0.928866446018219, "learning_rate": 2e-05, "loss": 0.658, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5373, "tokens_per_second_per_gpu": 18101.22, "total_tokens": 530481373 }, { "epoch": 0.3359589897474369, "grad_norm": 0.8821768760681152, "learning_rate": 2e-05, "loss": 0.6744, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5374, "tokens_per_second_per_gpu": 17629.01, "total_tokens": 530584097 }, { "epoch": 0.33602150537634407, "grad_norm": 0.9166985154151917, "learning_rate": 2e-05, "loss": 0.6827, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5375, "tokens_per_second_per_gpu": 17781.62, "total_tokens": 530682074 }, { "epoch": 0.3360840210052513, "grad_norm": 0.9147521257400513, "learning_rate": 2e-05, "loss": 0.6668, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5376, "tokens_per_second_per_gpu": 17391.1, "total_tokens": 530782104 }, { "epoch": 0.33614653663415855, "grad_norm": 0.8689067363739014, "learning_rate": 2e-05, "loss": 0.6478, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5377, "tokens_per_second_per_gpu": 18729.18, "total_tokens": 530883559 }, { "epoch": 0.33620905226306574, "grad_norm": 0.9087894558906555, "learning_rate": 2e-05, "loss": 0.6808, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5378, "tokens_per_second_per_gpu": 18868.93, "total_tokens": 530986506 }, { "epoch": 0.336271567891973, "grad_norm": 0.9058757424354553, "learning_rate": 2e-05, "loss": 0.6248, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5379, "tokens_per_second_per_gpu": 16703.39, "total_tokens": 531083789 }, { "epoch": 0.33633408352088023, "grad_norm": 0.8888129591941833, "learning_rate": 2e-05, "loss": 0.669, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5380, "tokens_per_second_per_gpu": 17291.09, "total_tokens": 531186524 }, { "epoch": 0.3363965991497874, "grad_norm": 0.8565376400947571, "learning_rate": 2e-05, "loss": 0.6501, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5381, "tokens_per_second_per_gpu": 18087.1, "total_tokens": 531286132 }, { "epoch": 0.33645911477869467, "grad_norm": 0.915006160736084, "learning_rate": 2e-05, "loss": 0.6659, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5382, "tokens_per_second_per_gpu": 19089.88, "total_tokens": 531390659 }, { "epoch": 0.3365216304076019, "grad_norm": 0.8656724095344543, "learning_rate": 2e-05, "loss": 0.6663, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5383, "tokens_per_second_per_gpu": 17683.41, "total_tokens": 531489624 }, { "epoch": 0.3365841460365091, "grad_norm": 0.8940266966819763, "learning_rate": 2e-05, "loss": 0.71, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5384, "tokens_per_second_per_gpu": 17251.7, "total_tokens": 531592447 }, { "epoch": 0.33664666166541635, "grad_norm": 0.8818626403808594, "learning_rate": 2e-05, "loss": 0.6723, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5385, "tokens_per_second_per_gpu": 17589.1, "total_tokens": 531693335 }, { "epoch": 0.3367091772943236, "grad_norm": 0.8908727765083313, "learning_rate": 2e-05, "loss": 0.6819, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5386, "tokens_per_second_per_gpu": 17552.74, "total_tokens": 531791929 }, { "epoch": 0.3367716929232308, "grad_norm": 0.9684516191482544, "learning_rate": 2e-05, "loss": 0.6715, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5387, "tokens_per_second_per_gpu": 17008.45, "total_tokens": 531891893 }, { "epoch": 0.336834208552138, "grad_norm": 0.9022843241691589, "learning_rate": 2e-05, "loss": 0.6494, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5388, "tokens_per_second_per_gpu": 17491.04, "total_tokens": 531991043 }, { "epoch": 0.33689672418104527, "grad_norm": 0.8947890996932983, "learning_rate": 2e-05, "loss": 0.6979, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5389, "tokens_per_second_per_gpu": 17632.54, "total_tokens": 532090437 }, { "epoch": 0.3369592398099525, "grad_norm": 0.9053913354873657, "learning_rate": 2e-05, "loss": 0.6576, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5390, "tokens_per_second_per_gpu": 16650.24, "total_tokens": 532189165 }, { "epoch": 0.3370217554388597, "grad_norm": 0.9910836815834045, "learning_rate": 2e-05, "loss": 0.6376, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5391, "tokens_per_second_per_gpu": 17034.04, "total_tokens": 532289224 }, { "epoch": 0.33708427106776695, "grad_norm": 0.892054557800293, "learning_rate": 2e-05, "loss": 0.6933, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5392, "tokens_per_second_per_gpu": 17558.18, "total_tokens": 532388456 }, { "epoch": 0.3371467866966742, "grad_norm": 0.9252997040748596, "learning_rate": 2e-05, "loss": 0.608, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5393, "tokens_per_second_per_gpu": 16915.05, "total_tokens": 532482653 }, { "epoch": 0.3372093023255814, "grad_norm": 0.8984182476997375, "learning_rate": 2e-05, "loss": 0.6948, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5394, "tokens_per_second_per_gpu": 17434.85, "total_tokens": 532580971 }, { "epoch": 0.33727181795448863, "grad_norm": 0.9295763373374939, "learning_rate": 2e-05, "loss": 0.6738, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5395, "tokens_per_second_per_gpu": 18798.53, "total_tokens": 532684327 }, { "epoch": 0.3373343335833959, "grad_norm": 0.8719924092292786, "learning_rate": 2e-05, "loss": 0.6813, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5396, "tokens_per_second_per_gpu": 18424.98, "total_tokens": 532789521 }, { "epoch": 0.33739684921230306, "grad_norm": 0.9447962641716003, "learning_rate": 2e-05, "loss": 0.6375, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5397, "tokens_per_second_per_gpu": 16052.86, "total_tokens": 532884257 }, { "epoch": 0.3374593648412103, "grad_norm": 0.8962081074714661, "learning_rate": 2e-05, "loss": 0.6959, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5398, "tokens_per_second_per_gpu": 18648.62, "total_tokens": 532987909 }, { "epoch": 0.33752188047011755, "grad_norm": 0.8862172365188599, "learning_rate": 2e-05, "loss": 0.6717, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5399, "tokens_per_second_per_gpu": 17893.82, "total_tokens": 533090572 }, { "epoch": 0.33758439609902474, "grad_norm": 0.9370237588882446, "learning_rate": 2e-05, "loss": 0.6772, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5400, "tokens_per_second_per_gpu": 15794.07, "total_tokens": 533184157 }, { "epoch": 0.337646911727932, "grad_norm": 0.91343754529953, "learning_rate": 2e-05, "loss": 0.6751, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5401, "tokens_per_second_per_gpu": 18470.31, "total_tokens": 533284586 }, { "epoch": 0.33770942735683923, "grad_norm": 0.8848538398742676, "learning_rate": 2e-05, "loss": 0.7089, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5402, "tokens_per_second_per_gpu": 18186.59, "total_tokens": 533387648 }, { "epoch": 0.3377719429857464, "grad_norm": 0.8833662271499634, "learning_rate": 2e-05, "loss": 0.6991, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5403, "tokens_per_second_per_gpu": 18540.81, "total_tokens": 533489354 }, { "epoch": 0.33783445861465367, "grad_norm": 0.878718376159668, "learning_rate": 2e-05, "loss": 0.6641, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5404, "tokens_per_second_per_gpu": 17600.25, "total_tokens": 533592550 }, { "epoch": 0.3378969742435609, "grad_norm": 0.8539886474609375, "learning_rate": 2e-05, "loss": 0.6566, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5405, "tokens_per_second_per_gpu": 18287.82, "total_tokens": 533696030 }, { "epoch": 0.3379594898724681, "grad_norm": 0.8987839818000793, "learning_rate": 2e-05, "loss": 0.6614, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5406, "tokens_per_second_per_gpu": 17321.7, "total_tokens": 533797715 }, { "epoch": 0.33802200550137534, "grad_norm": 0.8877841830253601, "learning_rate": 2e-05, "loss": 0.6641, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5407, "tokens_per_second_per_gpu": 17713.56, "total_tokens": 533898042 }, { "epoch": 0.3380845211302826, "grad_norm": 0.9102402329444885, "learning_rate": 2e-05, "loss": 0.6129, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5408, "tokens_per_second_per_gpu": 16564.57, "total_tokens": 533993549 }, { "epoch": 0.3381470367591898, "grad_norm": 0.8610270619392395, "learning_rate": 2e-05, "loss": 0.6538, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5409, "tokens_per_second_per_gpu": 17408.85, "total_tokens": 534092033 }, { "epoch": 0.338209552388097, "grad_norm": 0.886917769908905, "learning_rate": 2e-05, "loss": 0.6594, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5410, "tokens_per_second_per_gpu": 17461.09, "total_tokens": 534195518 }, { "epoch": 0.33827206801700427, "grad_norm": 0.9021059274673462, "learning_rate": 2e-05, "loss": 0.6626, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5411, "tokens_per_second_per_gpu": 19012.68, "total_tokens": 534298997 }, { "epoch": 0.33833458364591146, "grad_norm": 0.8679525256156921, "learning_rate": 2e-05, "loss": 0.6659, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5412, "tokens_per_second_per_gpu": 18079.98, "total_tokens": 534400835 }, { "epoch": 0.3383970992748187, "grad_norm": 0.933125913143158, "learning_rate": 2e-05, "loss": 0.676, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5413, "tokens_per_second_per_gpu": 17562.14, "total_tokens": 534500567 }, { "epoch": 0.33845961490372595, "grad_norm": 0.9050900936126709, "learning_rate": 2e-05, "loss": 0.68, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5414, "tokens_per_second_per_gpu": 18335.85, "total_tokens": 534603181 }, { "epoch": 0.33852213053263314, "grad_norm": 0.8983616828918457, "learning_rate": 2e-05, "loss": 0.6649, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5415, "tokens_per_second_per_gpu": 17608.93, "total_tokens": 534703749 }, { "epoch": 0.3385846461615404, "grad_norm": 0.9033164978027344, "learning_rate": 2e-05, "loss": 0.692, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5416, "tokens_per_second_per_gpu": 17549.94, "total_tokens": 534803848 }, { "epoch": 0.3386471617904476, "grad_norm": 0.9065383672714233, "learning_rate": 2e-05, "loss": 0.7029, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5417, "tokens_per_second_per_gpu": 18080.73, "total_tokens": 534905224 }, { "epoch": 0.3387096774193548, "grad_norm": 0.8815723061561584, "learning_rate": 2e-05, "loss": 0.6248, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5418, "tokens_per_second_per_gpu": 17280.27, "total_tokens": 534999410 }, { "epoch": 0.33877219304826206, "grad_norm": 0.9089972376823425, "learning_rate": 2e-05, "loss": 0.6976, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5419, "tokens_per_second_per_gpu": 17779.61, "total_tokens": 535101518 }, { "epoch": 0.3388347086771693, "grad_norm": 0.9083095788955688, "learning_rate": 2e-05, "loss": 0.6648, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5420, "tokens_per_second_per_gpu": 16449.89, "total_tokens": 535198053 }, { "epoch": 0.3388972243060765, "grad_norm": 0.9063370823860168, "learning_rate": 2e-05, "loss": 0.622, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5421, "tokens_per_second_per_gpu": 16807.9, "total_tokens": 535290883 }, { "epoch": 0.33895973993498374, "grad_norm": 0.8825885057449341, "learning_rate": 2e-05, "loss": 0.6529, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5422, "tokens_per_second_per_gpu": 17286.81, "total_tokens": 535389974 }, { "epoch": 0.339022255563891, "grad_norm": 0.9086211919784546, "learning_rate": 2e-05, "loss": 0.668, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5423, "tokens_per_second_per_gpu": 16874.45, "total_tokens": 535486936 }, { "epoch": 0.3390847711927982, "grad_norm": 0.8598430156707764, "learning_rate": 2e-05, "loss": 0.6394, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5424, "tokens_per_second_per_gpu": 16385.14, "total_tokens": 535583925 }, { "epoch": 0.3391472868217054, "grad_norm": 0.8713620901107788, "learning_rate": 2e-05, "loss": 0.6815, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5425, "tokens_per_second_per_gpu": 17836.91, "total_tokens": 535686629 }, { "epoch": 0.33920980245061266, "grad_norm": 0.8933026790618896, "learning_rate": 2e-05, "loss": 0.6886, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5426, "tokens_per_second_per_gpu": 17570.85, "total_tokens": 535785205 }, { "epoch": 0.33927231807951985, "grad_norm": 0.8656491041183472, "learning_rate": 2e-05, "loss": 0.6222, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5427, "tokens_per_second_per_gpu": 17964.09, "total_tokens": 535885047 }, { "epoch": 0.3393348337084271, "grad_norm": 0.8907063007354736, "learning_rate": 2e-05, "loss": 0.6946, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5428, "tokens_per_second_per_gpu": 18185.69, "total_tokens": 535985750 }, { "epoch": 0.33939734933733434, "grad_norm": 0.8924597501754761, "learning_rate": 2e-05, "loss": 0.6436, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5429, "tokens_per_second_per_gpu": 16793.31, "total_tokens": 536085021 }, { "epoch": 0.3394598649662416, "grad_norm": 0.9259805679321289, "learning_rate": 2e-05, "loss": 0.6616, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5430, "tokens_per_second_per_gpu": 17631.06, "total_tokens": 536184857 }, { "epoch": 0.3395223805951488, "grad_norm": 0.8647100925445557, "learning_rate": 2e-05, "loss": 0.6666, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5431, "tokens_per_second_per_gpu": 17875.71, "total_tokens": 536287843 }, { "epoch": 0.339584896224056, "grad_norm": 0.894356369972229, "learning_rate": 2e-05, "loss": 0.6688, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5432, "tokens_per_second_per_gpu": 16764.83, "total_tokens": 536387145 }, { "epoch": 0.33964741185296327, "grad_norm": 0.9364796876907349, "learning_rate": 2e-05, "loss": 0.6487, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5433, "tokens_per_second_per_gpu": 15874.54, "total_tokens": 536482518 }, { "epoch": 0.33970992748187046, "grad_norm": 0.9008806347846985, "learning_rate": 2e-05, "loss": 0.6567, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5434, "tokens_per_second_per_gpu": 17713.69, "total_tokens": 536579169 }, { "epoch": 0.3397724431107777, "grad_norm": 0.8954789638519287, "learning_rate": 2e-05, "loss": 0.6755, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5435, "tokens_per_second_per_gpu": 17665.93, "total_tokens": 536679513 }, { "epoch": 0.33983495873968494, "grad_norm": 0.9039112329483032, "learning_rate": 2e-05, "loss": 0.678, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5436, "tokens_per_second_per_gpu": 17753.18, "total_tokens": 536781903 }, { "epoch": 0.33989747436859213, "grad_norm": 0.9124963283538818, "learning_rate": 2e-05, "loss": 0.6505, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5437, "tokens_per_second_per_gpu": 17764.82, "total_tokens": 536881790 }, { "epoch": 0.3399599899974994, "grad_norm": 0.8912750482559204, "learning_rate": 2e-05, "loss": 0.6486, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5438, "tokens_per_second_per_gpu": 17724.34, "total_tokens": 536982018 }, { "epoch": 0.3400225056264066, "grad_norm": 0.8935611844062805, "learning_rate": 2e-05, "loss": 0.6787, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5439, "tokens_per_second_per_gpu": 17802.44, "total_tokens": 537083592 }, { "epoch": 0.3400850212553138, "grad_norm": 0.9231229424476624, "learning_rate": 2e-05, "loss": 0.7041, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5440, "tokens_per_second_per_gpu": 16675.79, "total_tokens": 537179656 }, { "epoch": 0.34014753688422106, "grad_norm": 0.9213036298751831, "learning_rate": 2e-05, "loss": 0.6699, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5441, "tokens_per_second_per_gpu": 17798.9, "total_tokens": 537279822 }, { "epoch": 0.3402100525131283, "grad_norm": 0.8864755630493164, "learning_rate": 2e-05, "loss": 0.6859, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5442, "tokens_per_second_per_gpu": 17571.18, "total_tokens": 537380182 }, { "epoch": 0.3402725681420355, "grad_norm": 0.8920334577560425, "learning_rate": 2e-05, "loss": 0.6843, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5443, "tokens_per_second_per_gpu": 18271.77, "total_tokens": 537480819 }, { "epoch": 0.34033508377094274, "grad_norm": 0.9053800702095032, "learning_rate": 2e-05, "loss": 0.6682, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5444, "tokens_per_second_per_gpu": 17265.78, "total_tokens": 537579126 }, { "epoch": 0.34039759939985, "grad_norm": 0.8832743167877197, "learning_rate": 2e-05, "loss": 0.6917, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5445, "tokens_per_second_per_gpu": 16434.86, "total_tokens": 537675308 }, { "epoch": 0.34046011502875717, "grad_norm": 0.8814948201179504, "learning_rate": 2e-05, "loss": 0.6597, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5446, "tokens_per_second_per_gpu": 17314.39, "total_tokens": 537773559 }, { "epoch": 0.3405226306576644, "grad_norm": 0.8819679617881775, "learning_rate": 2e-05, "loss": 0.6583, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5447, "tokens_per_second_per_gpu": 17616.88, "total_tokens": 537873115 }, { "epoch": 0.34058514628657166, "grad_norm": 0.9536260366439819, "learning_rate": 2e-05, "loss": 0.6772, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5448, "tokens_per_second_per_gpu": 17325.19, "total_tokens": 537970154 }, { "epoch": 0.34064766191547885, "grad_norm": 0.901835024356842, "learning_rate": 2e-05, "loss": 0.6265, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5449, "tokens_per_second_per_gpu": 17515.72, "total_tokens": 538068250 }, { "epoch": 0.3407101775443861, "grad_norm": 0.8970900177955627, "learning_rate": 2e-05, "loss": 0.6996, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5450, "tokens_per_second_per_gpu": 17938.12, "total_tokens": 538167547 }, { "epoch": 0.34077269317329334, "grad_norm": 0.944737434387207, "learning_rate": 2e-05, "loss": 0.6854, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5451, "tokens_per_second_per_gpu": 18231.29, "total_tokens": 538268957 }, { "epoch": 0.34083520880220053, "grad_norm": 0.887730598449707, "learning_rate": 2e-05, "loss": 0.6504, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5452, "tokens_per_second_per_gpu": 17140.73, "total_tokens": 538367660 }, { "epoch": 0.3408977244311078, "grad_norm": 0.8966837525367737, "learning_rate": 2e-05, "loss": 0.6601, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5453, "tokens_per_second_per_gpu": 18042.38, "total_tokens": 538467850 }, { "epoch": 0.340960240060015, "grad_norm": 0.9657477140426636, "learning_rate": 2e-05, "loss": 0.6815, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5454, "tokens_per_second_per_gpu": 17299.06, "total_tokens": 538563724 }, { "epoch": 0.3410227556889222, "grad_norm": 0.9248191714286804, "learning_rate": 2e-05, "loss": 0.6421, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5455, "tokens_per_second_per_gpu": 16477.95, "total_tokens": 538660064 }, { "epoch": 0.34108527131782945, "grad_norm": 0.8889015316963196, "learning_rate": 2e-05, "loss": 0.6656, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5456, "tokens_per_second_per_gpu": 17500.44, "total_tokens": 538759574 }, { "epoch": 0.3411477869467367, "grad_norm": 0.9108323454856873, "learning_rate": 2e-05, "loss": 0.6753, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5457, "tokens_per_second_per_gpu": 17543.49, "total_tokens": 538861279 }, { "epoch": 0.3412103025756439, "grad_norm": 0.9038654565811157, "learning_rate": 2e-05, "loss": 0.6254, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5458, "tokens_per_second_per_gpu": 17424.11, "total_tokens": 538956221 }, { "epoch": 0.34127281820455113, "grad_norm": 0.8727227449417114, "learning_rate": 2e-05, "loss": 0.6234, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5459, "tokens_per_second_per_gpu": 18722.18, "total_tokens": 539062138 }, { "epoch": 0.3413353338334584, "grad_norm": 0.9314501881599426, "learning_rate": 2e-05, "loss": 0.6433, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5460, "tokens_per_second_per_gpu": 17512.17, "total_tokens": 539163137 }, { "epoch": 0.34139784946236557, "grad_norm": 0.8957692384719849, "learning_rate": 2e-05, "loss": 0.7029, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5461, "tokens_per_second_per_gpu": 16784.93, "total_tokens": 539260565 }, { "epoch": 0.3414603650912728, "grad_norm": 0.8861040472984314, "learning_rate": 2e-05, "loss": 0.64, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5462, "tokens_per_second_per_gpu": 16143.35, "total_tokens": 539356310 }, { "epoch": 0.34152288072018006, "grad_norm": 0.8584674000740051, "learning_rate": 2e-05, "loss": 0.6297, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5463, "tokens_per_second_per_gpu": 16757.04, "total_tokens": 539453231 }, { "epoch": 0.34158539634908724, "grad_norm": 0.9541844129562378, "learning_rate": 2e-05, "loss": 0.6705, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5464, "tokens_per_second_per_gpu": 18143.43, "total_tokens": 539556905 }, { "epoch": 0.3416479119779945, "grad_norm": 0.8993721604347229, "learning_rate": 2e-05, "loss": 0.6933, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5465, "tokens_per_second_per_gpu": 17035.08, "total_tokens": 539656773 }, { "epoch": 0.34171042760690173, "grad_norm": 0.901467502117157, "learning_rate": 2e-05, "loss": 0.69, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5466, "tokens_per_second_per_gpu": 17399.53, "total_tokens": 539756141 }, { "epoch": 0.341772943235809, "grad_norm": 0.9317559003829956, "learning_rate": 2e-05, "loss": 0.6765, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5467, "tokens_per_second_per_gpu": 17304.44, "total_tokens": 539855157 }, { "epoch": 0.34183545886471617, "grad_norm": 0.9061091542243958, "learning_rate": 2e-05, "loss": 0.6047, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5468, "tokens_per_second_per_gpu": 15312.52, "total_tokens": 539946546 }, { "epoch": 0.3418979744936234, "grad_norm": 0.8958848118782043, "learning_rate": 2e-05, "loss": 0.6516, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5469, "tokens_per_second_per_gpu": 17585.21, "total_tokens": 540044702 }, { "epoch": 0.34196049012253066, "grad_norm": 0.856674313545227, "learning_rate": 2e-05, "loss": 0.6532, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5470, "tokens_per_second_per_gpu": 16409.33, "total_tokens": 540143169 }, { "epoch": 0.34202300575143785, "grad_norm": 0.8757954835891724, "learning_rate": 2e-05, "loss": 0.6992, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5471, "tokens_per_second_per_gpu": 17475.99, "total_tokens": 540241839 }, { "epoch": 0.3420855213803451, "grad_norm": 0.9114265441894531, "learning_rate": 2e-05, "loss": 0.686, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5472, "tokens_per_second_per_gpu": 17723.22, "total_tokens": 540343398 }, { "epoch": 0.34214803700925234, "grad_norm": 0.8995331525802612, "learning_rate": 2e-05, "loss": 0.7057, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5473, "tokens_per_second_per_gpu": 17632.4, "total_tokens": 540446627 }, { "epoch": 0.3422105526381595, "grad_norm": 0.9377579092979431, "learning_rate": 2e-05, "loss": 0.6531, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5474, "tokens_per_second_per_gpu": 17771.66, "total_tokens": 540544735 }, { "epoch": 0.34227306826706677, "grad_norm": 0.8808644413948059, "learning_rate": 2e-05, "loss": 0.6051, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5475, "tokens_per_second_per_gpu": 18062.02, "total_tokens": 540645821 }, { "epoch": 0.342335583895974, "grad_norm": 0.9597553610801697, "learning_rate": 2e-05, "loss": 0.6701, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5476, "tokens_per_second_per_gpu": 16809.08, "total_tokens": 540741945 }, { "epoch": 0.3423980995248812, "grad_norm": 0.9333469867706299, "learning_rate": 2e-05, "loss": 0.6321, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5477, "tokens_per_second_per_gpu": 16583.52, "total_tokens": 540838092 }, { "epoch": 0.34246061515378845, "grad_norm": 0.8691417574882507, "learning_rate": 2e-05, "loss": 0.6341, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5478, "tokens_per_second_per_gpu": 17674.12, "total_tokens": 540936386 }, { "epoch": 0.3425231307826957, "grad_norm": 0.9207940697669983, "learning_rate": 2e-05, "loss": 0.7149, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5479, "tokens_per_second_per_gpu": 16550.14, "total_tokens": 541033718 }, { "epoch": 0.3425856464116029, "grad_norm": 0.8962507247924805, "learning_rate": 2e-05, "loss": 0.6866, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5480, "tokens_per_second_per_gpu": 18674.95, "total_tokens": 541134674 }, { "epoch": 0.34264816204051013, "grad_norm": 0.8995320200920105, "learning_rate": 2e-05, "loss": 0.6611, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5481, "tokens_per_second_per_gpu": 15546.94, "total_tokens": 541230322 }, { "epoch": 0.3427106776694174, "grad_norm": 0.9023046493530273, "learning_rate": 2e-05, "loss": 0.6903, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5482, "tokens_per_second_per_gpu": 16504.58, "total_tokens": 541328746 }, { "epoch": 0.34277319329832456, "grad_norm": 0.9696767330169678, "learning_rate": 2e-05, "loss": 0.6997, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5483, "tokens_per_second_per_gpu": 17304.15, "total_tokens": 541427066 }, { "epoch": 0.3428357089272318, "grad_norm": 0.9090365171432495, "learning_rate": 2e-05, "loss": 0.6878, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5484, "tokens_per_second_per_gpu": 18518.61, "total_tokens": 541530184 }, { "epoch": 0.34289822455613905, "grad_norm": 0.9186379909515381, "learning_rate": 2e-05, "loss": 0.6653, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5485, "tokens_per_second_per_gpu": 16567.42, "total_tokens": 541626744 }, { "epoch": 0.34296074018504624, "grad_norm": 0.9102254509925842, "learning_rate": 2e-05, "loss": 0.6578, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5486, "tokens_per_second_per_gpu": 17270.27, "total_tokens": 541725655 }, { "epoch": 0.3430232558139535, "grad_norm": 0.919109046459198, "learning_rate": 2e-05, "loss": 0.6757, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5487, "tokens_per_second_per_gpu": 17849.98, "total_tokens": 541828337 }, { "epoch": 0.34308577144286073, "grad_norm": 0.9439157247543335, "learning_rate": 2e-05, "loss": 0.6292, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5488, "tokens_per_second_per_gpu": 17103.01, "total_tokens": 541924446 }, { "epoch": 0.3431482870717679, "grad_norm": 0.8977526426315308, "learning_rate": 2e-05, "loss": 0.6917, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5489, "tokens_per_second_per_gpu": 17736.59, "total_tokens": 542027170 }, { "epoch": 0.34321080270067517, "grad_norm": 0.9252673983573914, "learning_rate": 2e-05, "loss": 0.6387, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5490, "tokens_per_second_per_gpu": 17259.38, "total_tokens": 542126047 }, { "epoch": 0.3432733183295824, "grad_norm": 0.8990893959999084, "learning_rate": 2e-05, "loss": 0.6856, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5491, "tokens_per_second_per_gpu": 8470.06, "total_tokens": 542229084 }, { "epoch": 0.3433358339584896, "grad_norm": 0.8502176403999329, "learning_rate": 2e-05, "loss": 0.6123, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5492, "tokens_per_second_per_gpu": 7558.38, "total_tokens": 542326920 }, { "epoch": 0.34339834958739685, "grad_norm": 0.8947775363922119, "learning_rate": 2e-05, "loss": 0.6735, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5493, "tokens_per_second_per_gpu": 7536.03, "total_tokens": 542425131 }, { "epoch": 0.3434608652163041, "grad_norm": 0.9085193872451782, "learning_rate": 2e-05, "loss": 0.687, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5494, "tokens_per_second_per_gpu": 7470.24, "total_tokens": 542526635 }, { "epoch": 0.3435233808452113, "grad_norm": 0.9456030130386353, "learning_rate": 2e-05, "loss": 0.7138, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5495, "tokens_per_second_per_gpu": 7703.24, "total_tokens": 542628370 }, { "epoch": 0.3435858964741185, "grad_norm": 0.9297339916229248, "learning_rate": 2e-05, "loss": 0.6454, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5496, "tokens_per_second_per_gpu": 7095.49, "total_tokens": 542725578 }, { "epoch": 0.34364841210302577, "grad_norm": 0.9124757051467896, "learning_rate": 2e-05, "loss": 0.6811, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5497, "tokens_per_second_per_gpu": 7042.23, "total_tokens": 542823705 }, { "epoch": 0.34371092773193296, "grad_norm": 0.8588289618492126, "learning_rate": 2e-05, "loss": 0.6625, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5498, "tokens_per_second_per_gpu": 7669.03, "total_tokens": 542922111 }, { "epoch": 0.3437734433608402, "grad_norm": 0.8558123111724854, "learning_rate": 2e-05, "loss": 0.6148, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5499, "tokens_per_second_per_gpu": 7245.31, "total_tokens": 543024074 }, { "epoch": 0.34383595898974745, "grad_norm": 0.8966588377952576, "learning_rate": 2e-05, "loss": 0.6652, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5500, "tokens_per_second_per_gpu": 9272.62, "total_tokens": 543125361 }, { "epoch": 0.34389847461865464, "grad_norm": 0.921669602394104, "learning_rate": 2e-05, "loss": 0.6345, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5501, "tokens_per_second_per_gpu": 17265.57, "total_tokens": 543223998 }, { "epoch": 0.3439609902475619, "grad_norm": 0.9014936089515686, "learning_rate": 2e-05, "loss": 0.7036, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5502, "tokens_per_second_per_gpu": 7474.54, "total_tokens": 543321993 }, { "epoch": 0.3440235058764691, "grad_norm": 0.8635323643684387, "learning_rate": 2e-05, "loss": 0.619, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5503, "tokens_per_second_per_gpu": 7405.42, "total_tokens": 543420289 }, { "epoch": 0.34408602150537637, "grad_norm": 0.8866659998893738, "learning_rate": 2e-05, "loss": 0.6473, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5504, "tokens_per_second_per_gpu": 7590.66, "total_tokens": 543517365 }, { "epoch": 0.34414853713428356, "grad_norm": 0.897072970867157, "learning_rate": 2e-05, "loss": 0.6291, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5505, "tokens_per_second_per_gpu": 7540.55, "total_tokens": 543613888 }, { "epoch": 0.3442110527631908, "grad_norm": 0.902717649936676, "learning_rate": 2e-05, "loss": 0.6826, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5506, "tokens_per_second_per_gpu": 7902.98, "total_tokens": 543715817 }, { "epoch": 0.34427356839209805, "grad_norm": 0.9426490068435669, "learning_rate": 2e-05, "loss": 0.6821, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5507, "tokens_per_second_per_gpu": 7785.44, "total_tokens": 543817755 }, { "epoch": 0.34433608402100524, "grad_norm": 0.8610023260116577, "learning_rate": 2e-05, "loss": 0.6558, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5508, "tokens_per_second_per_gpu": 7981.12, "total_tokens": 543920124 }, { "epoch": 0.3443985996499125, "grad_norm": 0.8859238028526306, "learning_rate": 2e-05, "loss": 0.6617, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5509, "tokens_per_second_per_gpu": 7837.63, "total_tokens": 544018222 }, { "epoch": 0.34446111527881973, "grad_norm": 0.8854931592941284, "learning_rate": 2e-05, "loss": 0.6648, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5510, "tokens_per_second_per_gpu": 7339.84, "total_tokens": 544115366 }, { "epoch": 0.3445236309077269, "grad_norm": 0.8927059769630432, "learning_rate": 2e-05, "loss": 0.6533, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5511, "tokens_per_second_per_gpu": 9731.84, "total_tokens": 544212250 }, { "epoch": 0.34458614653663416, "grad_norm": 0.8745449781417847, "learning_rate": 2e-05, "loss": 0.6168, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5512, "tokens_per_second_per_gpu": 13834.53, "total_tokens": 544309885 }, { "epoch": 0.3446486621655414, "grad_norm": 0.8851749300956726, "learning_rate": 2e-05, "loss": 0.6284, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5513, "tokens_per_second_per_gpu": 7350.91, "total_tokens": 544404834 }, { "epoch": 0.3447111777944486, "grad_norm": 0.9545820951461792, "learning_rate": 2e-05, "loss": 0.6836, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5514, "tokens_per_second_per_gpu": 7436.08, "total_tokens": 544505885 }, { "epoch": 0.34477369342335584, "grad_norm": 0.8780264854431152, "learning_rate": 2e-05, "loss": 0.668, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5515, "tokens_per_second_per_gpu": 7799.87, "total_tokens": 544607282 }, { "epoch": 0.3448362090522631, "grad_norm": 0.9933829307556152, "learning_rate": 2e-05, "loss": 0.634, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5516, "tokens_per_second_per_gpu": 7272.88, "total_tokens": 544702225 }, { "epoch": 0.3448987246811703, "grad_norm": 0.8920584917068481, "learning_rate": 2e-05, "loss": 0.6627, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5517, "tokens_per_second_per_gpu": 7832.39, "total_tokens": 544805567 }, { "epoch": 0.3449612403100775, "grad_norm": 0.9394065141677856, "learning_rate": 2e-05, "loss": 0.6805, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5518, "tokens_per_second_per_gpu": 7567.1, "total_tokens": 544904791 }, { "epoch": 0.34502375593898477, "grad_norm": 1.1238666772842407, "learning_rate": 2e-05, "loss": 0.6843, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5519, "tokens_per_second_per_gpu": 7363.59, "total_tokens": 545003087 }, { "epoch": 0.34508627156789196, "grad_norm": 0.8984625339508057, "learning_rate": 2e-05, "loss": 0.6488, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5520, "tokens_per_second_per_gpu": 7587.31, "total_tokens": 545103862 }, { "epoch": 0.3451487871967992, "grad_norm": 0.900342583656311, "learning_rate": 2e-05, "loss": 0.6692, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5521, "tokens_per_second_per_gpu": 7818.11, "total_tokens": 545203812 }, { "epoch": 0.34521130282570645, "grad_norm": 0.9089477062225342, "learning_rate": 2e-05, "loss": 0.6125, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5522, "tokens_per_second_per_gpu": 11370.74, "total_tokens": 545300771 }, { "epoch": 0.34527381845461363, "grad_norm": 0.9271345138549805, "learning_rate": 2e-05, "loss": 0.6774, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5523, "tokens_per_second_per_gpu": 10842.89, "total_tokens": 545395970 }, { "epoch": 0.3453363340835209, "grad_norm": 0.9141470193862915, "learning_rate": 2e-05, "loss": 0.6123, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5524, "tokens_per_second_per_gpu": 7404.19, "total_tokens": 545493056 }, { "epoch": 0.3453988497124281, "grad_norm": 0.8939403891563416, "learning_rate": 2e-05, "loss": 0.651, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5525, "tokens_per_second_per_gpu": 8009.94, "total_tokens": 545593579 }, { "epoch": 0.3454613653413353, "grad_norm": 0.9078317284584045, "learning_rate": 2e-05, "loss": 0.7652, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5526, "tokens_per_second_per_gpu": 7740.73, "total_tokens": 545696172 }, { "epoch": 0.34552388097024256, "grad_norm": 0.8531894087791443, "learning_rate": 2e-05, "loss": 0.6729, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5527, "tokens_per_second_per_gpu": 7797.19, "total_tokens": 545798551 }, { "epoch": 0.3455863965991498, "grad_norm": 0.9144214391708374, "learning_rate": 2e-05, "loss": 0.666, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5528, "tokens_per_second_per_gpu": 8016.89, "total_tokens": 545898893 }, { "epoch": 0.345648912228057, "grad_norm": 0.9255946278572083, "learning_rate": 2e-05, "loss": 0.6501, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5529, "tokens_per_second_per_gpu": 6940.38, "total_tokens": 545993860 }, { "epoch": 0.34571142785696424, "grad_norm": 0.8837077021598816, "learning_rate": 2e-05, "loss": 0.6769, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5530, "tokens_per_second_per_gpu": 7986.08, "total_tokens": 546092881 }, { "epoch": 0.3457739434858715, "grad_norm": 0.8983625173568726, "learning_rate": 2e-05, "loss": 0.6825, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5531, "tokens_per_second_per_gpu": 6951.5, "total_tokens": 546191352 }, { "epoch": 0.34583645911477867, "grad_norm": 0.9003201723098755, "learning_rate": 2e-05, "loss": 0.6343, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5532, "tokens_per_second_per_gpu": 7584.65, "total_tokens": 546286707 }, { "epoch": 0.3458989747436859, "grad_norm": 0.9163950681686401, "learning_rate": 2e-05, "loss": 0.6442, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5533, "tokens_per_second_per_gpu": 14575.54, "total_tokens": 546386639 }, { "epoch": 0.34596149037259316, "grad_norm": 0.9607660174369812, "learning_rate": 2e-05, "loss": 0.6699, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5534, "tokens_per_second_per_gpu": 9971.69, "total_tokens": 546484361 }, { "epoch": 0.34602400600150035, "grad_norm": 0.8806659579277039, "learning_rate": 2e-05, "loss": 0.659, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5535, "tokens_per_second_per_gpu": 7651.86, "total_tokens": 546584719 }, { "epoch": 0.3460865216304076, "grad_norm": 0.9248536825180054, "learning_rate": 2e-05, "loss": 0.7259, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5536, "tokens_per_second_per_gpu": 7544.94, "total_tokens": 546684574 }, { "epoch": 0.34614903725931484, "grad_norm": 0.887470543384552, "learning_rate": 2e-05, "loss": 0.6663, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5537, "tokens_per_second_per_gpu": 7514.92, "total_tokens": 546784064 }, { "epoch": 0.34621155288822203, "grad_norm": 0.9419185519218445, "learning_rate": 2e-05, "loss": 0.6409, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5538, "tokens_per_second_per_gpu": 6631.71, "total_tokens": 546874945 }, { "epoch": 0.3462740685171293, "grad_norm": 0.8830046057701111, "learning_rate": 2e-05, "loss": 0.6283, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5539, "tokens_per_second_per_gpu": 7958.38, "total_tokens": 546973913 }, { "epoch": 0.3463365841460365, "grad_norm": 0.8812766671180725, "learning_rate": 2e-05, "loss": 0.6739, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5540, "tokens_per_second_per_gpu": 7930.59, "total_tokens": 547075960 }, { "epoch": 0.3463990997749437, "grad_norm": 0.8892247080802917, "learning_rate": 2e-05, "loss": 0.6751, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5541, "tokens_per_second_per_gpu": 7865.99, "total_tokens": 547178335 }, { "epoch": 0.34646161540385095, "grad_norm": 0.9247798919677734, "learning_rate": 2e-05, "loss": 0.6878, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5542, "tokens_per_second_per_gpu": 7785.34, "total_tokens": 547278054 }, { "epoch": 0.3465241310327582, "grad_norm": 0.8791472911834717, "learning_rate": 2e-05, "loss": 0.6262, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5543, "tokens_per_second_per_gpu": 7801.9, "total_tokens": 547374887 }, { "epoch": 0.34658664666166544, "grad_norm": 0.9339980483055115, "learning_rate": 2e-05, "loss": 0.6468, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5544, "tokens_per_second_per_gpu": 14896.88, "total_tokens": 547464886 }, { "epoch": 0.34664916229057263, "grad_norm": 0.9165722131729126, "learning_rate": 2e-05, "loss": 0.702, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5545, "tokens_per_second_per_gpu": 8884.91, "total_tokens": 547568488 }, { "epoch": 0.3467116779194799, "grad_norm": 0.9315817356109619, "learning_rate": 2e-05, "loss": 0.6814, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5546, "tokens_per_second_per_gpu": 8023.11, "total_tokens": 547671981 }, { "epoch": 0.3467741935483871, "grad_norm": 0.9316245317459106, "learning_rate": 2e-05, "loss": 0.6948, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5547, "tokens_per_second_per_gpu": 8136.84, "total_tokens": 547778758 }, { "epoch": 0.3468367091772943, "grad_norm": 0.8949649333953857, "learning_rate": 2e-05, "loss": 0.6489, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5548, "tokens_per_second_per_gpu": 7725.96, "total_tokens": 547876884 }, { "epoch": 0.34689922480620156, "grad_norm": 0.9092408418655396, "learning_rate": 2e-05, "loss": 0.6835, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5549, "tokens_per_second_per_gpu": 7551.66, "total_tokens": 547973528 }, { "epoch": 0.3469617404351088, "grad_norm": 0.96659916639328, "learning_rate": 2e-05, "loss": 0.7152, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5550, "tokens_per_second_per_gpu": 7483.57, "total_tokens": 548069186 }, { "epoch": 0.347024256064016, "grad_norm": 0.9335165023803711, "learning_rate": 2e-05, "loss": 0.6824, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5551, "tokens_per_second_per_gpu": 8126.45, "total_tokens": 548170649 }, { "epoch": 0.34708677169292323, "grad_norm": 0.9056077599525452, "learning_rate": 2e-05, "loss": 0.6382, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5552, "tokens_per_second_per_gpu": 7724.64, "total_tokens": 548269355 }, { "epoch": 0.3471492873218305, "grad_norm": 0.9214127063751221, "learning_rate": 2e-05, "loss": 0.6345, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5553, "tokens_per_second_per_gpu": 7519.92, "total_tokens": 548367002 }, { "epoch": 0.34721180295073767, "grad_norm": 0.9040054082870483, "learning_rate": 2e-05, "loss": 0.6238, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5554, "tokens_per_second_per_gpu": 8936.32, "total_tokens": 548464288 }, { "epoch": 0.3472743185796449, "grad_norm": 0.9749209880828857, "learning_rate": 2e-05, "loss": 0.6451, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5555, "tokens_per_second_per_gpu": 16685.49, "total_tokens": 548562125 }, { "epoch": 0.34733683420855216, "grad_norm": 0.8454492688179016, "learning_rate": 2e-05, "loss": 0.612, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5556, "tokens_per_second_per_gpu": 7534.17, "total_tokens": 548661441 }, { "epoch": 0.34739934983745935, "grad_norm": 0.8942367434501648, "learning_rate": 2e-05, "loss": 0.6125, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5557, "tokens_per_second_per_gpu": 6792.25, "total_tokens": 548751668 }, { "epoch": 0.3474618654663666, "grad_norm": 0.9367130398750305, "learning_rate": 2e-05, "loss": 0.6776, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5558, "tokens_per_second_per_gpu": 7608.28, "total_tokens": 548847550 }, { "epoch": 0.34752438109527384, "grad_norm": 0.886565089225769, "learning_rate": 2e-05, "loss": 0.6859, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5559, "tokens_per_second_per_gpu": 7831.04, "total_tokens": 548946947 }, { "epoch": 0.347586896724181, "grad_norm": 0.8584948182106018, "learning_rate": 2e-05, "loss": 0.6642, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5560, "tokens_per_second_per_gpu": 7715.4, "total_tokens": 549048724 }, { "epoch": 0.34764941235308827, "grad_norm": 0.8976364135742188, "learning_rate": 2e-05, "loss": 0.659, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5561, "tokens_per_second_per_gpu": 8102.05, "total_tokens": 549147363 }, { "epoch": 0.3477119279819955, "grad_norm": 0.8897249102592468, "learning_rate": 2e-05, "loss": 0.6054, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5562, "tokens_per_second_per_gpu": 7644.6, "total_tokens": 549244439 }, { "epoch": 0.3477744436109027, "grad_norm": 0.8892229795455933, "learning_rate": 2e-05, "loss": 0.6558, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5563, "tokens_per_second_per_gpu": 8384.19, "total_tokens": 549345873 }, { "epoch": 0.34783695923980995, "grad_norm": 0.9191032648086548, "learning_rate": 2e-05, "loss": 0.6567, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5564, "tokens_per_second_per_gpu": 7799.86, "total_tokens": 549442667 }, { "epoch": 0.3478994748687172, "grad_norm": 0.9180387854576111, "learning_rate": 2e-05, "loss": 0.613, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5565, "tokens_per_second_per_gpu": 11081.11, "total_tokens": 549539750 }, { "epoch": 0.3479619904976244, "grad_norm": 0.8998047113418579, "learning_rate": 2e-05, "loss": 0.6269, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5566, "tokens_per_second_per_gpu": 15586.25, "total_tokens": 549636305 }, { "epoch": 0.34802450612653163, "grad_norm": 0.9096084237098694, "learning_rate": 2e-05, "loss": 0.6401, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5567, "tokens_per_second_per_gpu": 7205.46, "total_tokens": 549733974 }, { "epoch": 0.3480870217554389, "grad_norm": 0.9279536008834839, "learning_rate": 2e-05, "loss": 0.6687, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5568, "tokens_per_second_per_gpu": 7571.24, "total_tokens": 549828885 }, { "epoch": 0.34814953738434606, "grad_norm": 0.8870232105255127, "learning_rate": 2e-05, "loss": 0.6455, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5569, "tokens_per_second_per_gpu": 7608.62, "total_tokens": 549926947 }, { "epoch": 0.3482120530132533, "grad_norm": 0.9350450038909912, "learning_rate": 2e-05, "loss": 0.6977, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5570, "tokens_per_second_per_gpu": 8077.72, "total_tokens": 550029652 }, { "epoch": 0.34827456864216055, "grad_norm": 0.9171619415283203, "learning_rate": 2e-05, "loss": 0.6928, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5571, "tokens_per_second_per_gpu": 7834.15, "total_tokens": 550133803 }, { "epoch": 0.34833708427106774, "grad_norm": 0.9121278524398804, "learning_rate": 2e-05, "loss": 0.6519, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5572, "tokens_per_second_per_gpu": 8229.67, "total_tokens": 550233849 }, { "epoch": 0.348399599899975, "grad_norm": 0.8917664289474487, "learning_rate": 2e-05, "loss": 0.6743, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5573, "tokens_per_second_per_gpu": 7572.66, "total_tokens": 550333259 }, { "epoch": 0.34846211552888223, "grad_norm": 0.9004389047622681, "learning_rate": 2e-05, "loss": 0.6685, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5574, "tokens_per_second_per_gpu": 7773.58, "total_tokens": 550430980 }, { "epoch": 0.3485246311577894, "grad_norm": 0.8833870887756348, "learning_rate": 2e-05, "loss": 0.6804, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5575, "tokens_per_second_per_gpu": 7753.45, "total_tokens": 550528974 }, { "epoch": 0.34858714678669667, "grad_norm": 0.8783833980560303, "learning_rate": 2e-05, "loss": 0.6793, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5576, "tokens_per_second_per_gpu": 11693.97, "total_tokens": 550631051 }, { "epoch": 0.3486496624156039, "grad_norm": 0.9258441925048828, "learning_rate": 2e-05, "loss": 0.6446, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5577, "tokens_per_second_per_gpu": 13511.16, "total_tokens": 550726460 }, { "epoch": 0.3487121780445111, "grad_norm": 0.9013025164604187, "learning_rate": 2e-05, "loss": 0.6243, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5578, "tokens_per_second_per_gpu": 7602.94, "total_tokens": 550824132 }, { "epoch": 0.34877469367341835, "grad_norm": 0.898826003074646, "learning_rate": 2e-05, "loss": 0.6839, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5579, "tokens_per_second_per_gpu": 8133.48, "total_tokens": 550925291 }, { "epoch": 0.3488372093023256, "grad_norm": 0.9067687392234802, "learning_rate": 2e-05, "loss": 0.6828, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5580, "tokens_per_second_per_gpu": 7518.77, "total_tokens": 551023640 }, { "epoch": 0.34889972493123284, "grad_norm": 0.9065192937850952, "learning_rate": 2e-05, "loss": 0.681, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5581, "tokens_per_second_per_gpu": 7646.7, "total_tokens": 551125846 }, { "epoch": 0.34896224056014, "grad_norm": 0.927325427532196, "learning_rate": 2e-05, "loss": 0.667, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5582, "tokens_per_second_per_gpu": 8032.36, "total_tokens": 551221369 }, { "epoch": 0.34902475618904727, "grad_norm": 0.9570987820625305, "learning_rate": 2e-05, "loss": 0.6922, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5583, "tokens_per_second_per_gpu": 7280.16, "total_tokens": 551319704 }, { "epoch": 0.3490872718179545, "grad_norm": 0.8644220232963562, "learning_rate": 2e-05, "loss": 0.6657, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5584, "tokens_per_second_per_gpu": 7882.13, "total_tokens": 551423933 }, { "epoch": 0.3491497874468617, "grad_norm": 0.9114165902137756, "learning_rate": 2e-05, "loss": 0.6691, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5585, "tokens_per_second_per_gpu": 7548.76, "total_tokens": 551519552 }, { "epoch": 0.34921230307576895, "grad_norm": 0.9214586615562439, "learning_rate": 2e-05, "loss": 0.694, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5586, "tokens_per_second_per_gpu": 7802.08, "total_tokens": 551617629 }, { "epoch": 0.3492748187046762, "grad_norm": 0.907128095626831, "learning_rate": 2e-05, "loss": 0.6736, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5587, "tokens_per_second_per_gpu": 12847.89, "total_tokens": 551714573 }, { "epoch": 0.3493373343335834, "grad_norm": 0.8953025937080383, "learning_rate": 2e-05, "loss": 0.6588, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5588, "tokens_per_second_per_gpu": 11352.1, "total_tokens": 551815511 }, { "epoch": 0.3493998499624906, "grad_norm": 0.9034217000007629, "learning_rate": 2e-05, "loss": 0.689, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5589, "tokens_per_second_per_gpu": 7751.54, "total_tokens": 551913998 }, { "epoch": 0.34946236559139787, "grad_norm": 0.8909643292427063, "learning_rate": 2e-05, "loss": 0.6113, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5590, "tokens_per_second_per_gpu": 7286.28, "total_tokens": 552007261 }, { "epoch": 0.34952488122030506, "grad_norm": 0.9221689701080322, "learning_rate": 2e-05, "loss": 0.666, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5591, "tokens_per_second_per_gpu": 7519.81, "total_tokens": 552104608 }, { "epoch": 0.3495873968492123, "grad_norm": 0.9047439694404602, "learning_rate": 2e-05, "loss": 0.6638, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5592, "tokens_per_second_per_gpu": 8146.12, "total_tokens": 552206072 }, { "epoch": 0.34964991247811955, "grad_norm": 0.8606290221214294, "learning_rate": 2e-05, "loss": 0.6063, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5593, "tokens_per_second_per_gpu": 8115.18, "total_tokens": 552303338 }, { "epoch": 0.34971242810702674, "grad_norm": 0.9064872860908508, "learning_rate": 2e-05, "loss": 0.6889, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5594, "tokens_per_second_per_gpu": 7573.23, "total_tokens": 552405935 }, { "epoch": 0.349774943735934, "grad_norm": 0.9054933786392212, "learning_rate": 2e-05, "loss": 0.6558, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5595, "tokens_per_second_per_gpu": 7677.62, "total_tokens": 552507655 }, { "epoch": 0.34983745936484123, "grad_norm": 0.8489993810653687, "learning_rate": 2e-05, "loss": 0.661, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5596, "tokens_per_second_per_gpu": 7555.65, "total_tokens": 552607251 }, { "epoch": 0.3498999749937484, "grad_norm": 0.9024850130081177, "learning_rate": 2e-05, "loss": 0.6926, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5597, "tokens_per_second_per_gpu": 7172.16, "total_tokens": 552702912 }, { "epoch": 0.34996249062265566, "grad_norm": 0.8788970112800598, "learning_rate": 2e-05, "loss": 0.6932, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5598, "tokens_per_second_per_gpu": 16585.24, "total_tokens": 552802635 }, { "epoch": 0.3500250062515629, "grad_norm": 0.890140950679779, "learning_rate": 2e-05, "loss": 0.6649, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5599, "tokens_per_second_per_gpu": 8785.23, "total_tokens": 552903158 }, { "epoch": 0.3500875218804701, "grad_norm": 0.8856598138809204, "learning_rate": 2e-05, "loss": 0.6679, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5600, "tokens_per_second_per_gpu": 7856.75, "total_tokens": 553003998 }, { "epoch": 0.35015003750937734, "grad_norm": 0.9110183119773865, "learning_rate": 2e-05, "loss": 0.6668, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5601, "tokens_per_second_per_gpu": 7915.97, "total_tokens": 553104744 }, { "epoch": 0.3502125531382846, "grad_norm": 0.8848443627357483, "learning_rate": 2e-05, "loss": 0.6568, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5602, "tokens_per_second_per_gpu": 7805.39, "total_tokens": 553202953 }, { "epoch": 0.3502750687671918, "grad_norm": 0.8836299777030945, "learning_rate": 2e-05, "loss": 0.6982, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5603, "tokens_per_second_per_gpu": 7792.0, "total_tokens": 553301922 }, { "epoch": 0.350337584396099, "grad_norm": 0.926740825176239, "learning_rate": 2e-05, "loss": 0.6455, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5604, "tokens_per_second_per_gpu": 7090.93, "total_tokens": 553398434 }, { "epoch": 0.35040010002500627, "grad_norm": 0.9161000847816467, "learning_rate": 2e-05, "loss": 0.661, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5605, "tokens_per_second_per_gpu": 7552.38, "total_tokens": 553495777 }, { "epoch": 0.35046261565391346, "grad_norm": 0.9196637868881226, "learning_rate": 2e-05, "loss": 0.6876, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5606, "tokens_per_second_per_gpu": 7632.57, "total_tokens": 553592680 }, { "epoch": 0.3505251312828207, "grad_norm": 0.8891096711158752, "learning_rate": 2e-05, "loss": 0.6392, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5607, "tokens_per_second_per_gpu": 7720.31, "total_tokens": 553691689 }, { "epoch": 0.35058764691172795, "grad_norm": 0.9158166646957397, "learning_rate": 2e-05, "loss": 0.6865, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5608, "tokens_per_second_per_gpu": 8407.32, "total_tokens": 553789491 }, { "epoch": 0.35065016254063514, "grad_norm": 0.885150671005249, "learning_rate": 2e-05, "loss": 0.6783, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5609, "tokens_per_second_per_gpu": 18600.29, "total_tokens": 553894707 }, { "epoch": 0.3507126781695424, "grad_norm": 0.8727560043334961, "learning_rate": 2e-05, "loss": 0.6895, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5610, "tokens_per_second_per_gpu": 8345.54, "total_tokens": 553995003 }, { "epoch": 0.3507751937984496, "grad_norm": 0.9005116820335388, "learning_rate": 2e-05, "loss": 0.6866, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5611, "tokens_per_second_per_gpu": 7724.01, "total_tokens": 554096052 }, { "epoch": 0.3508377094273568, "grad_norm": 0.894523024559021, "learning_rate": 2e-05, "loss": 0.6726, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5612, "tokens_per_second_per_gpu": 7478.5, "total_tokens": 554195582 }, { "epoch": 0.35090022505626406, "grad_norm": 0.8946406841278076, "learning_rate": 2e-05, "loss": 0.6387, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5613, "tokens_per_second_per_gpu": 7411.11, "total_tokens": 554293749 }, { "epoch": 0.3509627406851713, "grad_norm": 0.9151350259780884, "learning_rate": 2e-05, "loss": 0.6611, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5614, "tokens_per_second_per_gpu": 7375.52, "total_tokens": 554390802 }, { "epoch": 0.3510252563140785, "grad_norm": 0.9384749531745911, "learning_rate": 2e-05, "loss": 0.6848, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5615, "tokens_per_second_per_gpu": 7955.18, "total_tokens": 554491288 }, { "epoch": 0.35108777194298574, "grad_norm": 0.9062789678573608, "learning_rate": 2e-05, "loss": 0.6605, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5616, "tokens_per_second_per_gpu": 7057.85, "total_tokens": 554587936 }, { "epoch": 0.351150287571893, "grad_norm": 0.9469649195671082, "learning_rate": 2e-05, "loss": 0.7258, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5617, "tokens_per_second_per_gpu": 8311.99, "total_tokens": 554692787 }, { "epoch": 0.3512128032008002, "grad_norm": 0.936226487159729, "learning_rate": 2e-05, "loss": 0.6899, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5618, "tokens_per_second_per_gpu": 8065.14, "total_tokens": 554793300 }, { "epoch": 0.3512753188297074, "grad_norm": 0.892071008682251, "learning_rate": 2e-05, "loss": 0.6686, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5619, "tokens_per_second_per_gpu": 10098.37, "total_tokens": 554894664 }, { "epoch": 0.35133783445861466, "grad_norm": 0.9463286399841309, "learning_rate": 2e-05, "loss": 0.6647, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5620, "tokens_per_second_per_gpu": 17177.78, "total_tokens": 554992092 }, { "epoch": 0.3514003500875219, "grad_norm": 0.9414716958999634, "learning_rate": 2e-05, "loss": 0.6498, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5621, "tokens_per_second_per_gpu": 17678.31, "total_tokens": 555084950 }, { "epoch": 0.3514628657164291, "grad_norm": 0.8941126465797424, "learning_rate": 2e-05, "loss": 0.6349, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5622, "tokens_per_second_per_gpu": 16883.92, "total_tokens": 555181577 }, { "epoch": 0.35152538134533634, "grad_norm": 0.9060514569282532, "learning_rate": 2e-05, "loss": 0.7169, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5623, "tokens_per_second_per_gpu": 19461.92, "total_tokens": 555286907 }, { "epoch": 0.3515878969742436, "grad_norm": 0.9004878401756287, "learning_rate": 2e-05, "loss": 0.608, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5624, "tokens_per_second_per_gpu": 18260.51, "total_tokens": 555385847 }, { "epoch": 0.3516504126031508, "grad_norm": 0.8887948989868164, "learning_rate": 2e-05, "loss": 0.5846, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5625, "tokens_per_second_per_gpu": 16707.54, "total_tokens": 555477378 }, { "epoch": 0.351712928232058, "grad_norm": 0.8720179796218872, "learning_rate": 2e-05, "loss": 0.662, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5626, "tokens_per_second_per_gpu": 17590.66, "total_tokens": 555576571 }, { "epoch": 0.35177544386096526, "grad_norm": 0.89646977186203, "learning_rate": 2e-05, "loss": 0.7097, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5627, "tokens_per_second_per_gpu": 18515.03, "total_tokens": 555682667 }, { "epoch": 0.35183795948987245, "grad_norm": 0.894821047782898, "learning_rate": 2e-05, "loss": 0.6443, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5628, "tokens_per_second_per_gpu": 18457.72, "total_tokens": 555783906 }, { "epoch": 0.3519004751187797, "grad_norm": 0.8795440793037415, "learning_rate": 2e-05, "loss": 0.6345, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5629, "tokens_per_second_per_gpu": 18052.99, "total_tokens": 555882768 }, { "epoch": 0.35196299074768694, "grad_norm": 0.9728223085403442, "learning_rate": 2e-05, "loss": 0.6616, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5630, "tokens_per_second_per_gpu": 16449.36, "total_tokens": 555974212 }, { "epoch": 0.35202550637659413, "grad_norm": 0.8633262515068054, "learning_rate": 2e-05, "loss": 0.6695, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5631, "tokens_per_second_per_gpu": 17892.35, "total_tokens": 556076709 }, { "epoch": 0.3520880220055014, "grad_norm": 0.9224250912666321, "learning_rate": 2e-05, "loss": 0.7051, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5632, "tokens_per_second_per_gpu": 18041.07, "total_tokens": 556176907 }, { "epoch": 0.3521505376344086, "grad_norm": 0.9347554445266724, "learning_rate": 2e-05, "loss": 0.6495, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5633, "tokens_per_second_per_gpu": 16740.32, "total_tokens": 556271160 }, { "epoch": 0.3522130532633158, "grad_norm": 0.9337344169616699, "learning_rate": 2e-05, "loss": 0.6423, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5634, "tokens_per_second_per_gpu": 18432.51, "total_tokens": 556368528 }, { "epoch": 0.35227556889222306, "grad_norm": 0.9171221852302551, "learning_rate": 2e-05, "loss": 0.6601, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5635, "tokens_per_second_per_gpu": 15503.44, "total_tokens": 556462200 }, { "epoch": 0.3523380845211303, "grad_norm": 0.8905045390129089, "learning_rate": 2e-05, "loss": 0.6982, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5636, "tokens_per_second_per_gpu": 18448.09, "total_tokens": 556562277 }, { "epoch": 0.3524006001500375, "grad_norm": 0.9034264087677002, "learning_rate": 2e-05, "loss": 0.6645, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5637, "tokens_per_second_per_gpu": 17471.67, "total_tokens": 556658826 }, { "epoch": 0.35246311577894474, "grad_norm": 0.9307233691215515, "learning_rate": 2e-05, "loss": 0.6621, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5638, "tokens_per_second_per_gpu": 17619.87, "total_tokens": 556757767 }, { "epoch": 0.352525631407852, "grad_norm": 0.883926510810852, "learning_rate": 2e-05, "loss": 0.6506, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5639, "tokens_per_second_per_gpu": 18757.31, "total_tokens": 556858878 }, { "epoch": 0.35258814703675917, "grad_norm": 0.8792882561683655, "learning_rate": 2e-05, "loss": 0.6782, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5640, "tokens_per_second_per_gpu": 17594.65, "total_tokens": 556959458 }, { "epoch": 0.3526506626656664, "grad_norm": 0.8873811364173889, "learning_rate": 2e-05, "loss": 0.6843, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5641, "tokens_per_second_per_gpu": 17665.92, "total_tokens": 557061218 }, { "epoch": 0.35271317829457366, "grad_norm": 0.9121993780136108, "learning_rate": 2e-05, "loss": 0.6868, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5642, "tokens_per_second_per_gpu": 18214.17, "total_tokens": 557161443 }, { "epoch": 0.35277569392348085, "grad_norm": 0.8851265907287598, "learning_rate": 2e-05, "loss": 0.6585, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5643, "tokens_per_second_per_gpu": 17112.62, "total_tokens": 557258611 }, { "epoch": 0.3528382095523881, "grad_norm": 0.9261267185211182, "learning_rate": 2e-05, "loss": 0.6709, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5644, "tokens_per_second_per_gpu": 17268.89, "total_tokens": 557356174 }, { "epoch": 0.35290072518129534, "grad_norm": 0.9163837432861328, "learning_rate": 2e-05, "loss": 0.6594, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5645, "tokens_per_second_per_gpu": 16860.62, "total_tokens": 557453692 }, { "epoch": 0.3529632408102025, "grad_norm": 0.8558416962623596, "learning_rate": 2e-05, "loss": 0.6535, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5646, "tokens_per_second_per_gpu": 17993.63, "total_tokens": 557554921 }, { "epoch": 0.3530257564391098, "grad_norm": 0.9084020853042603, "learning_rate": 2e-05, "loss": 0.6937, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5647, "tokens_per_second_per_gpu": 17366.0, "total_tokens": 557652349 }, { "epoch": 0.353088272068017, "grad_norm": 0.9002678990364075, "learning_rate": 2e-05, "loss": 0.6396, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5648, "tokens_per_second_per_gpu": 17693.08, "total_tokens": 557748691 }, { "epoch": 0.3531507876969242, "grad_norm": 0.8542921543121338, "learning_rate": 2e-05, "loss": 0.6719, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5649, "tokens_per_second_per_gpu": 17849.85, "total_tokens": 557851963 }, { "epoch": 0.35321330332583145, "grad_norm": 0.9041237235069275, "learning_rate": 2e-05, "loss": 0.6747, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5650, "tokens_per_second_per_gpu": 17208.72, "total_tokens": 557951213 }, { "epoch": 0.3532758189547387, "grad_norm": 0.9271349310874939, "learning_rate": 2e-05, "loss": 0.6764, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5651, "tokens_per_second_per_gpu": 18615.21, "total_tokens": 558049906 }, { "epoch": 0.3533383345836459, "grad_norm": 0.8881821632385254, "learning_rate": 2e-05, "loss": 0.7144, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5652, "tokens_per_second_per_gpu": 18387.3, "total_tokens": 558154798 }, { "epoch": 0.35340085021255313, "grad_norm": 0.8991655111312866, "learning_rate": 2e-05, "loss": 0.6869, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5653, "tokens_per_second_per_gpu": 16950.07, "total_tokens": 558254158 }, { "epoch": 0.3534633658414604, "grad_norm": 0.9018246531486511, "learning_rate": 2e-05, "loss": 0.6491, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5654, "tokens_per_second_per_gpu": 17035.13, "total_tokens": 558348792 }, { "epoch": 0.35352588147036756, "grad_norm": 0.9102247953414917, "learning_rate": 2e-05, "loss": 0.6449, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5655, "tokens_per_second_per_gpu": 17618.35, "total_tokens": 558445802 }, { "epoch": 0.3535883970992748, "grad_norm": 0.868933916091919, "learning_rate": 2e-05, "loss": 0.6607, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5656, "tokens_per_second_per_gpu": 18312.69, "total_tokens": 558550271 }, { "epoch": 0.35365091272818205, "grad_norm": 0.8995385766029358, "learning_rate": 2e-05, "loss": 0.6645, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5657, "tokens_per_second_per_gpu": 17406.56, "total_tokens": 558651190 }, { "epoch": 0.3537134283570893, "grad_norm": 0.8956695795059204, "learning_rate": 2e-05, "loss": 0.6449, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5658, "tokens_per_second_per_gpu": 17882.77, "total_tokens": 558750663 }, { "epoch": 0.3537759439859965, "grad_norm": 0.9232484698295593, "learning_rate": 2e-05, "loss": 0.6215, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5659, "tokens_per_second_per_gpu": 16450.96, "total_tokens": 558840432 }, { "epoch": 0.35383845961490373, "grad_norm": 0.9515870809555054, "learning_rate": 2e-05, "loss": 0.666, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5660, "tokens_per_second_per_gpu": 17450.93, "total_tokens": 558941172 }, { "epoch": 0.353900975243811, "grad_norm": 0.9100998640060425, "learning_rate": 2e-05, "loss": 0.6868, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5661, "tokens_per_second_per_gpu": 17754.14, "total_tokens": 559041798 }, { "epoch": 0.35396349087271817, "grad_norm": 0.9255967140197754, "learning_rate": 2e-05, "loss": 0.6492, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5662, "tokens_per_second_per_gpu": 17508.49, "total_tokens": 559136354 }, { "epoch": 0.3540260065016254, "grad_norm": 0.8909400701522827, "learning_rate": 2e-05, "loss": 0.6791, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5663, "tokens_per_second_per_gpu": 16082.93, "total_tokens": 559234005 }, { "epoch": 0.35408852213053266, "grad_norm": 0.9453400373458862, "learning_rate": 2e-05, "loss": 0.6859, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5664, "tokens_per_second_per_gpu": 17132.72, "total_tokens": 559332473 }, { "epoch": 0.35415103775943985, "grad_norm": 0.9086171388626099, "learning_rate": 2e-05, "loss": 0.6611, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5665, "tokens_per_second_per_gpu": 16161.53, "total_tokens": 559427824 }, { "epoch": 0.3542135533883471, "grad_norm": 0.9674777388572693, "learning_rate": 2e-05, "loss": 0.6141, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5666, "tokens_per_second_per_gpu": 17403.3, "total_tokens": 559525790 }, { "epoch": 0.35427606901725434, "grad_norm": 0.8795177936553955, "learning_rate": 2e-05, "loss": 0.637, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5667, "tokens_per_second_per_gpu": 17924.7, "total_tokens": 559626513 }, { "epoch": 0.3543385846461615, "grad_norm": 0.8641547560691833, "learning_rate": 2e-05, "loss": 0.6569, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5668, "tokens_per_second_per_gpu": 18455.19, "total_tokens": 559728656 }, { "epoch": 0.35440110027506877, "grad_norm": 0.959300696849823, "learning_rate": 2e-05, "loss": 0.6858, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5669, "tokens_per_second_per_gpu": 17014.31, "total_tokens": 559828498 }, { "epoch": 0.354463615903976, "grad_norm": 0.8832741975784302, "learning_rate": 2e-05, "loss": 0.6753, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5670, "tokens_per_second_per_gpu": 18247.75, "total_tokens": 559928564 }, { "epoch": 0.3545261315328832, "grad_norm": 0.9418001770973206, "learning_rate": 2e-05, "loss": 0.664, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5671, "tokens_per_second_per_gpu": 17706.04, "total_tokens": 560027029 }, { "epoch": 0.35458864716179045, "grad_norm": 0.9051284790039062, "learning_rate": 2e-05, "loss": 0.6378, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5672, "tokens_per_second_per_gpu": 18189.59, "total_tokens": 560125951 }, { "epoch": 0.3546511627906977, "grad_norm": 0.878295361995697, "learning_rate": 2e-05, "loss": 0.683, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5673, "tokens_per_second_per_gpu": 17357.49, "total_tokens": 560226156 }, { "epoch": 0.3547136784196049, "grad_norm": 0.8988738656044006, "learning_rate": 2e-05, "loss": 0.6774, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5674, "tokens_per_second_per_gpu": 17322.04, "total_tokens": 560327755 }, { "epoch": 0.35477619404851213, "grad_norm": 0.9313392639160156, "learning_rate": 2e-05, "loss": 0.6717, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5675, "tokens_per_second_per_gpu": 17094.1, "total_tokens": 560427596 }, { "epoch": 0.3548387096774194, "grad_norm": 0.9225049614906311, "learning_rate": 2e-05, "loss": 0.6748, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5676, "tokens_per_second_per_gpu": 16690.62, "total_tokens": 560523154 }, { "epoch": 0.35490122530632656, "grad_norm": 0.9015263915061951, "learning_rate": 2e-05, "loss": 0.6894, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5677, "tokens_per_second_per_gpu": 17933.59, "total_tokens": 560625054 }, { "epoch": 0.3549637409352338, "grad_norm": 0.9599990248680115, "learning_rate": 2e-05, "loss": 0.641, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5678, "tokens_per_second_per_gpu": 16208.64, "total_tokens": 560720180 }, { "epoch": 0.35502625656414105, "grad_norm": 0.8806362152099609, "learning_rate": 2e-05, "loss": 0.651, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5679, "tokens_per_second_per_gpu": 16927.2, "total_tokens": 560819865 }, { "epoch": 0.35508877219304824, "grad_norm": 0.8704252243041992, "learning_rate": 2e-05, "loss": 0.636, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5680, "tokens_per_second_per_gpu": 17798.7, "total_tokens": 560916963 }, { "epoch": 0.3551512878219555, "grad_norm": 0.8996316194534302, "learning_rate": 2e-05, "loss": 0.6491, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5681, "tokens_per_second_per_gpu": 17914.29, "total_tokens": 561015751 }, { "epoch": 0.35521380345086273, "grad_norm": 0.9121812582015991, "learning_rate": 2e-05, "loss": 0.6772, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5682, "tokens_per_second_per_gpu": 17277.12, "total_tokens": 561114418 }, { "epoch": 0.3552763190797699, "grad_norm": 0.89202880859375, "learning_rate": 2e-05, "loss": 0.6394, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5683, "tokens_per_second_per_gpu": 17462.94, "total_tokens": 561213885 }, { "epoch": 0.35533883470867716, "grad_norm": 0.9377177953720093, "learning_rate": 2e-05, "loss": 0.6753, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5684, "tokens_per_second_per_gpu": 16985.16, "total_tokens": 561315563 }, { "epoch": 0.3554013503375844, "grad_norm": 0.9184455275535583, "learning_rate": 2e-05, "loss": 0.6519, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5685, "tokens_per_second_per_gpu": 17979.36, "total_tokens": 561414522 }, { "epoch": 0.3554638659664916, "grad_norm": 0.9253261685371399, "learning_rate": 2e-05, "loss": 0.6522, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5686, "tokens_per_second_per_gpu": 16129.37, "total_tokens": 561504650 }, { "epoch": 0.35552638159539884, "grad_norm": 0.9011326432228088, "learning_rate": 2e-05, "loss": 0.6457, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5687, "tokens_per_second_per_gpu": 17777.23, "total_tokens": 561605609 }, { "epoch": 0.3555888972243061, "grad_norm": 0.8999262452125549, "learning_rate": 2e-05, "loss": 0.649, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5688, "tokens_per_second_per_gpu": 16680.12, "total_tokens": 561699441 }, { "epoch": 0.3556514128532133, "grad_norm": 0.9064591526985168, "learning_rate": 2e-05, "loss": 0.7021, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5689, "tokens_per_second_per_gpu": 16940.97, "total_tokens": 561797971 }, { "epoch": 0.3557139284821205, "grad_norm": 0.9180439710617065, "learning_rate": 2e-05, "loss": 0.6957, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5690, "tokens_per_second_per_gpu": 18016.26, "total_tokens": 561900758 }, { "epoch": 0.35577644411102777, "grad_norm": 0.8941909074783325, "learning_rate": 2e-05, "loss": 0.6406, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5691, "tokens_per_second_per_gpu": 16426.17, "total_tokens": 561994450 }, { "epoch": 0.35583895973993496, "grad_norm": 0.8900814056396484, "learning_rate": 2e-05, "loss": 0.6669, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5692, "tokens_per_second_per_gpu": 18775.54, "total_tokens": 562095704 }, { "epoch": 0.3559014753688422, "grad_norm": 0.8690280914306641, "learning_rate": 2e-05, "loss": 0.5905, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5693, "tokens_per_second_per_gpu": 16815.49, "total_tokens": 562191159 }, { "epoch": 0.35596399099774945, "grad_norm": 0.905949592590332, "learning_rate": 2e-05, "loss": 0.6577, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5694, "tokens_per_second_per_gpu": 18008.29, "total_tokens": 562290586 }, { "epoch": 0.3560265066266567, "grad_norm": 0.9276211261749268, "learning_rate": 2e-05, "loss": 0.65, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5695, "tokens_per_second_per_gpu": 17299.96, "total_tokens": 562388155 }, { "epoch": 0.3560890222555639, "grad_norm": 0.9329140782356262, "learning_rate": 2e-05, "loss": 0.6745, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5696, "tokens_per_second_per_gpu": 16281.56, "total_tokens": 562483175 }, { "epoch": 0.3561515378844711, "grad_norm": 0.8769370317459106, "learning_rate": 2e-05, "loss": 0.6505, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5697, "tokens_per_second_per_gpu": 15760.61, "total_tokens": 562578891 }, { "epoch": 0.35621405351337837, "grad_norm": 0.8936084508895874, "learning_rate": 2e-05, "loss": 0.6428, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5698, "tokens_per_second_per_gpu": 17623.73, "total_tokens": 562676646 }, { "epoch": 0.35627656914228556, "grad_norm": 0.9008745551109314, "learning_rate": 2e-05, "loss": 0.6553, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5699, "tokens_per_second_per_gpu": 17689.39, "total_tokens": 562775851 }, { "epoch": 0.3563390847711928, "grad_norm": 0.9324905872344971, "learning_rate": 2e-05, "loss": 0.6569, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5700, "tokens_per_second_per_gpu": 16854.33, "total_tokens": 562872686 }, { "epoch": 0.35640160040010005, "grad_norm": 0.8919503092765808, "learning_rate": 2e-05, "loss": 0.6792, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5701, "tokens_per_second_per_gpu": 17413.89, "total_tokens": 562975784 }, { "epoch": 0.35646411602900724, "grad_norm": 0.9223220348358154, "learning_rate": 2e-05, "loss": 0.6474, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5702, "tokens_per_second_per_gpu": 17413.73, "total_tokens": 563076467 }, { "epoch": 0.3565266316579145, "grad_norm": 0.9058775901794434, "learning_rate": 2e-05, "loss": 0.6698, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5703, "tokens_per_second_per_gpu": 17620.13, "total_tokens": 563174027 }, { "epoch": 0.35658914728682173, "grad_norm": 0.9005820751190186, "learning_rate": 2e-05, "loss": 0.6291, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5704, "tokens_per_second_per_gpu": 17942.44, "total_tokens": 563273967 }, { "epoch": 0.3566516629157289, "grad_norm": 0.9164385199546814, "learning_rate": 2e-05, "loss": 0.6897, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5705, "tokens_per_second_per_gpu": 17503.69, "total_tokens": 563371317 }, { "epoch": 0.35671417854463616, "grad_norm": 0.9015069007873535, "learning_rate": 2e-05, "loss": 0.6523, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5706, "tokens_per_second_per_gpu": 17254.39, "total_tokens": 563471246 }, { "epoch": 0.3567766941735434, "grad_norm": 0.8624268770217896, "learning_rate": 2e-05, "loss": 0.6287, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5707, "tokens_per_second_per_gpu": 18522.85, "total_tokens": 563571153 }, { "epoch": 0.3568392098024506, "grad_norm": 0.8643755912780762, "learning_rate": 2e-05, "loss": 0.6265, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5708, "tokens_per_second_per_gpu": 17573.47, "total_tokens": 563670849 }, { "epoch": 0.35690172543135784, "grad_norm": 0.8853487968444824, "learning_rate": 2e-05, "loss": 0.6254, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5709, "tokens_per_second_per_gpu": 17051.19, "total_tokens": 563768709 }, { "epoch": 0.3569642410602651, "grad_norm": 0.8478058576583862, "learning_rate": 2e-05, "loss": 0.6565, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5710, "tokens_per_second_per_gpu": 18542.68, "total_tokens": 563873914 }, { "epoch": 0.3570267566891723, "grad_norm": 0.8912604451179504, "learning_rate": 2e-05, "loss": 0.694, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5711, "tokens_per_second_per_gpu": 17318.2, "total_tokens": 563970804 }, { "epoch": 0.3570892723180795, "grad_norm": 0.8917189836502075, "learning_rate": 2e-05, "loss": 0.6978, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5712, "tokens_per_second_per_gpu": 18703.24, "total_tokens": 564072783 }, { "epoch": 0.35715178794698677, "grad_norm": 0.9285447001457214, "learning_rate": 2e-05, "loss": 0.6895, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5713, "tokens_per_second_per_gpu": 17387.5, "total_tokens": 564170823 }, { "epoch": 0.35721430357589395, "grad_norm": 0.9090434908866882, "learning_rate": 2e-05, "loss": 0.6565, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5714, "tokens_per_second_per_gpu": 16983.37, "total_tokens": 564271700 }, { "epoch": 0.3572768192048012, "grad_norm": 0.8785051703453064, "learning_rate": 2e-05, "loss": 0.6518, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5715, "tokens_per_second_per_gpu": 18297.05, "total_tokens": 564373335 }, { "epoch": 0.35733933483370844, "grad_norm": 0.881766676902771, "learning_rate": 2e-05, "loss": 0.6453, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5716, "tokens_per_second_per_gpu": 17604.98, "total_tokens": 564470055 }, { "epoch": 0.35740185046261563, "grad_norm": 0.8688915967941284, "learning_rate": 2e-05, "loss": 0.6358, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5717, "tokens_per_second_per_gpu": 17852.98, "total_tokens": 564566945 }, { "epoch": 0.3574643660915229, "grad_norm": 0.8673036694526672, "learning_rate": 2e-05, "loss": 0.6121, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5718, "tokens_per_second_per_gpu": 15904.16, "total_tokens": 564663951 }, { "epoch": 0.3575268817204301, "grad_norm": 0.9181350469589233, "learning_rate": 2e-05, "loss": 0.6391, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5719, "tokens_per_second_per_gpu": 17764.23, "total_tokens": 564762096 }, { "epoch": 0.3575893973493373, "grad_norm": 0.8690840005874634, "learning_rate": 2e-05, "loss": 0.6395, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5720, "tokens_per_second_per_gpu": 18659.15, "total_tokens": 564865167 }, { "epoch": 0.35765191297824456, "grad_norm": 0.9117662310600281, "learning_rate": 2e-05, "loss": 0.6641, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5721, "tokens_per_second_per_gpu": 16675.77, "total_tokens": 564962369 }, { "epoch": 0.3577144286071518, "grad_norm": 0.9583607316017151, "learning_rate": 2e-05, "loss": 0.6986, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5722, "tokens_per_second_per_gpu": 17056.99, "total_tokens": 565057286 }, { "epoch": 0.357776944236059, "grad_norm": 0.8706514835357666, "learning_rate": 2e-05, "loss": 0.6687, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5723, "tokens_per_second_per_gpu": 16712.41, "total_tokens": 565156820 }, { "epoch": 0.35783945986496624, "grad_norm": 0.8904245495796204, "learning_rate": 2e-05, "loss": 0.6786, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5724, "tokens_per_second_per_gpu": 17048.62, "total_tokens": 565255500 }, { "epoch": 0.3579019754938735, "grad_norm": 0.8423575162887573, "learning_rate": 2e-05, "loss": 0.6349, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5725, "tokens_per_second_per_gpu": 16303.07, "total_tokens": 565351831 }, { "epoch": 0.35796449112278067, "grad_norm": 0.9099768400192261, "learning_rate": 2e-05, "loss": 0.6686, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5726, "tokens_per_second_per_gpu": 17162.49, "total_tokens": 565454812 }, { "epoch": 0.3580270067516879, "grad_norm": 0.8493534326553345, "learning_rate": 2e-05, "loss": 0.6704, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5727, "tokens_per_second_per_gpu": 18255.91, "total_tokens": 565557159 }, { "epoch": 0.35808952238059516, "grad_norm": 0.898752748966217, "learning_rate": 2e-05, "loss": 0.6311, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5728, "tokens_per_second_per_gpu": 17486.72, "total_tokens": 565650708 }, { "epoch": 0.35815203800950235, "grad_norm": 0.9008411169052124, "learning_rate": 2e-05, "loss": 0.6983, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5729, "tokens_per_second_per_gpu": 16551.39, "total_tokens": 565746443 }, { "epoch": 0.3582145536384096, "grad_norm": 0.906408429145813, "learning_rate": 2e-05, "loss": 0.6285, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5730, "tokens_per_second_per_gpu": 17069.91, "total_tokens": 565839915 }, { "epoch": 0.35827706926731684, "grad_norm": 0.9090920090675354, "learning_rate": 2e-05, "loss": 0.6587, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5731, "tokens_per_second_per_gpu": 16877.99, "total_tokens": 565939108 }, { "epoch": 0.3583395848962241, "grad_norm": 0.9059568643569946, "learning_rate": 2e-05, "loss": 0.6693, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5732, "tokens_per_second_per_gpu": 17881.42, "total_tokens": 566041165 }, { "epoch": 0.3584021005251313, "grad_norm": 0.9131450653076172, "learning_rate": 2e-05, "loss": 0.6609, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5733, "tokens_per_second_per_gpu": 17397.13, "total_tokens": 566141378 }, { "epoch": 0.3584646161540385, "grad_norm": 0.941517174243927, "learning_rate": 2e-05, "loss": 0.651, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5734, "tokens_per_second_per_gpu": 17220.44, "total_tokens": 566237608 }, { "epoch": 0.35852713178294576, "grad_norm": 0.8908786177635193, "learning_rate": 2e-05, "loss": 0.6802, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5735, "tokens_per_second_per_gpu": 18858.91, "total_tokens": 566339019 }, { "epoch": 0.35858964741185295, "grad_norm": 0.8983550071716309, "learning_rate": 2e-05, "loss": 0.6888, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5736, "tokens_per_second_per_gpu": 17070.37, "total_tokens": 566436604 }, { "epoch": 0.3586521630407602, "grad_norm": 0.8518334627151489, "learning_rate": 2e-05, "loss": 0.6432, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5737, "tokens_per_second_per_gpu": 17813.73, "total_tokens": 566534081 }, { "epoch": 0.35871467866966744, "grad_norm": 0.9248837828636169, "learning_rate": 2e-05, "loss": 0.6787, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5738, "tokens_per_second_per_gpu": 17250.77, "total_tokens": 566629998 }, { "epoch": 0.35877719429857463, "grad_norm": 0.8633695244789124, "learning_rate": 2e-05, "loss": 0.6229, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5739, "tokens_per_second_per_gpu": 17985.35, "total_tokens": 566730526 }, { "epoch": 0.3588397099274819, "grad_norm": 0.8955618739128113, "learning_rate": 2e-05, "loss": 0.6823, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5740, "tokens_per_second_per_gpu": 17719.34, "total_tokens": 566829925 }, { "epoch": 0.3589022255563891, "grad_norm": 0.8883893489837646, "learning_rate": 2e-05, "loss": 0.6276, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5741, "tokens_per_second_per_gpu": 17454.23, "total_tokens": 566931336 }, { "epoch": 0.3589647411852963, "grad_norm": 0.9115043878555298, "learning_rate": 2e-05, "loss": 0.7018, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5742, "tokens_per_second_per_gpu": 18858.86, "total_tokens": 567031961 }, { "epoch": 0.35902725681420355, "grad_norm": 0.8595563769340515, "learning_rate": 2e-05, "loss": 0.6479, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5743, "tokens_per_second_per_gpu": 17793.6, "total_tokens": 567132085 }, { "epoch": 0.3590897724431108, "grad_norm": 0.8897480964660645, "learning_rate": 2e-05, "loss": 0.6513, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5744, "tokens_per_second_per_gpu": 16359.19, "total_tokens": 567228099 }, { "epoch": 0.359152288072018, "grad_norm": 0.9567954540252686, "learning_rate": 2e-05, "loss": 0.6532, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5745, "tokens_per_second_per_gpu": 15297.65, "total_tokens": 567321006 }, { "epoch": 0.35921480370092523, "grad_norm": 0.9320976138114929, "learning_rate": 2e-05, "loss": 0.6449, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5746, "tokens_per_second_per_gpu": 16559.45, "total_tokens": 567413544 }, { "epoch": 0.3592773193298325, "grad_norm": 0.8785229325294495, "learning_rate": 2e-05, "loss": 0.6537, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5747, "tokens_per_second_per_gpu": 17690.68, "total_tokens": 567516075 }, { "epoch": 0.35933983495873967, "grad_norm": 0.8509528636932373, "learning_rate": 2e-05, "loss": 0.6247, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5748, "tokens_per_second_per_gpu": 17697.53, "total_tokens": 567618070 }, { "epoch": 0.3594023505876469, "grad_norm": 0.9435656070709229, "learning_rate": 2e-05, "loss": 0.6534, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5749, "tokens_per_second_per_gpu": 16538.69, "total_tokens": 567714708 }, { "epoch": 0.35946486621655416, "grad_norm": 0.8895366787910461, "learning_rate": 2e-05, "loss": 0.6682, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5750, "tokens_per_second_per_gpu": 17133.71, "total_tokens": 567813962 }, { "epoch": 0.35952738184546135, "grad_norm": 0.8862075805664062, "learning_rate": 2e-05, "loss": 0.6463, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5751, "tokens_per_second_per_gpu": 18067.54, "total_tokens": 567912764 }, { "epoch": 0.3595898974743686, "grad_norm": 0.8662759065628052, "learning_rate": 2e-05, "loss": 0.6691, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5752, "tokens_per_second_per_gpu": 18014.3, "total_tokens": 568013490 }, { "epoch": 0.35965241310327584, "grad_norm": 0.9170519113540649, "learning_rate": 2e-05, "loss": 0.6352, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5753, "tokens_per_second_per_gpu": 16243.41, "total_tokens": 568110805 }, { "epoch": 0.359714928732183, "grad_norm": 0.8940345048904419, "learning_rate": 2e-05, "loss": 0.6803, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5754, "tokens_per_second_per_gpu": 18554.07, "total_tokens": 568214649 }, { "epoch": 0.35977744436109027, "grad_norm": 0.8833901286125183, "learning_rate": 2e-05, "loss": 0.6958, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5755, "tokens_per_second_per_gpu": 18105.42, "total_tokens": 568318609 }, { "epoch": 0.3598399599899975, "grad_norm": 0.8741684556007385, "learning_rate": 2e-05, "loss": 0.6335, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5756, "tokens_per_second_per_gpu": 16737.38, "total_tokens": 568415907 }, { "epoch": 0.3599024756189047, "grad_norm": 0.8756778240203857, "learning_rate": 2e-05, "loss": 0.6761, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5757, "tokens_per_second_per_gpu": 17787.06, "total_tokens": 568516716 }, { "epoch": 0.35996499124781195, "grad_norm": 0.9445482492446899, "learning_rate": 2e-05, "loss": 0.672, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5758, "tokens_per_second_per_gpu": 16228.82, "total_tokens": 568611983 }, { "epoch": 0.3600275068767192, "grad_norm": 0.8898860812187195, "learning_rate": 2e-05, "loss": 0.673, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5759, "tokens_per_second_per_gpu": 16399.99, "total_tokens": 568709846 }, { "epoch": 0.3600900225056264, "grad_norm": 0.9326948523521423, "learning_rate": 2e-05, "loss": 0.6841, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5760, "tokens_per_second_per_gpu": 17541.34, "total_tokens": 568808405 }, { "epoch": 0.36015253813453363, "grad_norm": 0.8912964463233948, "learning_rate": 2e-05, "loss": 0.6712, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5761, "tokens_per_second_per_gpu": 16197.3, "total_tokens": 568904236 }, { "epoch": 0.3602150537634409, "grad_norm": 0.9112641215324402, "learning_rate": 2e-05, "loss": 0.6586, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5762, "tokens_per_second_per_gpu": 17362.17, "total_tokens": 569002930 }, { "epoch": 0.36027756939234806, "grad_norm": 0.865368127822876, "learning_rate": 2e-05, "loss": 0.6287, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5763, "tokens_per_second_per_gpu": 17282.06, "total_tokens": 569103445 }, { "epoch": 0.3603400850212553, "grad_norm": 0.8716859221458435, "learning_rate": 2e-05, "loss": 0.6613, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5764, "tokens_per_second_per_gpu": 17515.52, "total_tokens": 569206145 }, { "epoch": 0.36040260065016255, "grad_norm": 0.8632294535636902, "learning_rate": 2e-05, "loss": 0.6512, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5765, "tokens_per_second_per_gpu": 17901.64, "total_tokens": 569305746 }, { "epoch": 0.36046511627906974, "grad_norm": 0.9143739342689514, "learning_rate": 2e-05, "loss": 0.6869, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5766, "tokens_per_second_per_gpu": 17462.46, "total_tokens": 569407780 }, { "epoch": 0.360527631907977, "grad_norm": 0.8742941617965698, "learning_rate": 2e-05, "loss": 0.6404, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5767, "tokens_per_second_per_gpu": 18144.09, "total_tokens": 569506590 }, { "epoch": 0.36059014753688423, "grad_norm": 0.87677001953125, "learning_rate": 2e-05, "loss": 0.6541, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5768, "tokens_per_second_per_gpu": 17559.72, "total_tokens": 569603290 }, { "epoch": 0.3606526631657914, "grad_norm": 0.8862701058387756, "learning_rate": 2e-05, "loss": 0.6527, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5769, "tokens_per_second_per_gpu": 18570.46, "total_tokens": 569708584 }, { "epoch": 0.36071517879469867, "grad_norm": 0.9589887857437134, "learning_rate": 2e-05, "loss": 0.6905, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5770, "tokens_per_second_per_gpu": 15419.81, "total_tokens": 569801488 }, { "epoch": 0.3607776944236059, "grad_norm": 0.8524960875511169, "learning_rate": 2e-05, "loss": 0.6122, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5771, "tokens_per_second_per_gpu": 16861.08, "total_tokens": 569901285 }, { "epoch": 0.36084021005251316, "grad_norm": 0.9309899210929871, "learning_rate": 2e-05, "loss": 0.6651, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5772, "tokens_per_second_per_gpu": 17173.11, "total_tokens": 569997822 }, { "epoch": 0.36090272568142034, "grad_norm": 0.8948540687561035, "learning_rate": 2e-05, "loss": 0.6666, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5773, "tokens_per_second_per_gpu": 16485.62, "total_tokens": 570094942 }, { "epoch": 0.3609652413103276, "grad_norm": 0.8744438886642456, "learning_rate": 2e-05, "loss": 0.6491, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5774, "tokens_per_second_per_gpu": 17542.94, "total_tokens": 570197016 }, { "epoch": 0.36102775693923483, "grad_norm": 0.9244672656059265, "learning_rate": 2e-05, "loss": 0.7082, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5775, "tokens_per_second_per_gpu": 17156.29, "total_tokens": 570295749 }, { "epoch": 0.361090272568142, "grad_norm": 0.9499097466468811, "learning_rate": 2e-05, "loss": 0.6643, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5776, "tokens_per_second_per_gpu": 16672.05, "total_tokens": 570389974 }, { "epoch": 0.36115278819704927, "grad_norm": 0.8520172834396362, "learning_rate": 2e-05, "loss": 0.6519, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5777, "tokens_per_second_per_gpu": 15314.05, "total_tokens": 570486891 }, { "epoch": 0.3612153038259565, "grad_norm": 0.9553311467170715, "learning_rate": 2e-05, "loss": 0.6787, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5778, "tokens_per_second_per_gpu": 17836.26, "total_tokens": 570584901 }, { "epoch": 0.3612778194548637, "grad_norm": 0.897910475730896, "learning_rate": 2e-05, "loss": 0.6565, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5779, "tokens_per_second_per_gpu": 16955.9, "total_tokens": 570681778 }, { "epoch": 0.36134033508377095, "grad_norm": 0.9134795665740967, "learning_rate": 2e-05, "loss": 0.6648, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5780, "tokens_per_second_per_gpu": 17601.4, "total_tokens": 570780073 }, { "epoch": 0.3614028507126782, "grad_norm": 0.8989028930664062, "learning_rate": 2e-05, "loss": 0.7004, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5781, "tokens_per_second_per_gpu": 18033.2, "total_tokens": 570883643 }, { "epoch": 0.3614653663415854, "grad_norm": 0.8783477544784546, "learning_rate": 2e-05, "loss": 0.6476, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5782, "tokens_per_second_per_gpu": 17457.8, "total_tokens": 570982544 }, { "epoch": 0.3615278819704926, "grad_norm": 0.9009310007095337, "learning_rate": 2e-05, "loss": 0.6672, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5783, "tokens_per_second_per_gpu": 17899.34, "total_tokens": 571081509 }, { "epoch": 0.36159039759939987, "grad_norm": 0.9123007655143738, "learning_rate": 2e-05, "loss": 0.6622, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5784, "tokens_per_second_per_gpu": 16833.55, "total_tokens": 571177324 }, { "epoch": 0.36165291322830706, "grad_norm": 0.9047073721885681, "learning_rate": 2e-05, "loss": 0.6746, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5785, "tokens_per_second_per_gpu": 17145.15, "total_tokens": 571274794 }, { "epoch": 0.3617154288572143, "grad_norm": 0.8844897150993347, "learning_rate": 2e-05, "loss": 0.6578, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5786, "tokens_per_second_per_gpu": 17964.54, "total_tokens": 571375691 }, { "epoch": 0.36177794448612155, "grad_norm": 0.8682690262794495, "learning_rate": 2e-05, "loss": 0.6752, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5787, "tokens_per_second_per_gpu": 17340.81, "total_tokens": 571476921 }, { "epoch": 0.36184046011502874, "grad_norm": 0.8852866888046265, "learning_rate": 2e-05, "loss": 0.6672, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5788, "tokens_per_second_per_gpu": 16480.24, "total_tokens": 571573543 }, { "epoch": 0.361902975743936, "grad_norm": 0.8752979040145874, "learning_rate": 2e-05, "loss": 0.6567, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5789, "tokens_per_second_per_gpu": 16723.31, "total_tokens": 571673670 }, { "epoch": 0.36196549137284323, "grad_norm": 0.8647202849388123, "learning_rate": 2e-05, "loss": 0.6901, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5790, "tokens_per_second_per_gpu": 17631.8, "total_tokens": 571776105 }, { "epoch": 0.3620280070017504, "grad_norm": 0.8677529096603394, "learning_rate": 2e-05, "loss": 0.6617, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5791, "tokens_per_second_per_gpu": 17882.85, "total_tokens": 571877824 }, { "epoch": 0.36209052263065766, "grad_norm": 0.8932716250419617, "learning_rate": 2e-05, "loss": 0.689, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5792, "tokens_per_second_per_gpu": 17914.36, "total_tokens": 571975814 }, { "epoch": 0.3621530382595649, "grad_norm": 0.8962432146072388, "learning_rate": 2e-05, "loss": 0.6673, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5793, "tokens_per_second_per_gpu": 17986.44, "total_tokens": 572077252 }, { "epoch": 0.3622155538884721, "grad_norm": 0.903918981552124, "learning_rate": 2e-05, "loss": 0.6709, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5794, "tokens_per_second_per_gpu": 16017.43, "total_tokens": 572175735 }, { "epoch": 0.36227806951737934, "grad_norm": 0.8609280586242676, "learning_rate": 2e-05, "loss": 0.6565, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5795, "tokens_per_second_per_gpu": 17434.43, "total_tokens": 572275926 }, { "epoch": 0.3623405851462866, "grad_norm": 0.9068530201911926, "learning_rate": 2e-05, "loss": 0.604, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5796, "tokens_per_second_per_gpu": 15774.8, "total_tokens": 572367182 }, { "epoch": 0.3624031007751938, "grad_norm": 0.9282803535461426, "learning_rate": 2e-05, "loss": 0.6663, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5797, "tokens_per_second_per_gpu": 16529.14, "total_tokens": 572466628 }, { "epoch": 0.362465616404101, "grad_norm": 0.9056474566459656, "learning_rate": 2e-05, "loss": 0.6569, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5798, "tokens_per_second_per_gpu": 15932.01, "total_tokens": 572560799 }, { "epoch": 0.36252813203300827, "grad_norm": 0.8866223096847534, "learning_rate": 2e-05, "loss": 0.6528, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5799, "tokens_per_second_per_gpu": 17384.88, "total_tokens": 572659987 }, { "epoch": 0.36259064766191546, "grad_norm": 0.8901169300079346, "learning_rate": 2e-05, "loss": 0.6362, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5800, "tokens_per_second_per_gpu": 18444.91, "total_tokens": 572759364 }, { "epoch": 0.3626531632908227, "grad_norm": 0.880012571811676, "learning_rate": 2e-05, "loss": 0.6472, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5801, "tokens_per_second_per_gpu": 17825.35, "total_tokens": 572859707 }, { "epoch": 0.36271567891972994, "grad_norm": 0.9147337675094604, "learning_rate": 2e-05, "loss": 0.6823, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5802, "tokens_per_second_per_gpu": 18052.64, "total_tokens": 572960626 }, { "epoch": 0.36277819454863713, "grad_norm": 0.8942899703979492, "learning_rate": 2e-05, "loss": 0.6494, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5803, "tokens_per_second_per_gpu": 16438.64, "total_tokens": 573056670 }, { "epoch": 0.3628407101775444, "grad_norm": 0.9657424092292786, "learning_rate": 2e-05, "loss": 0.7391, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5804, "tokens_per_second_per_gpu": 17401.03, "total_tokens": 573157295 }, { "epoch": 0.3629032258064516, "grad_norm": 0.9009554982185364, "learning_rate": 2e-05, "loss": 0.6832, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5805, "tokens_per_second_per_gpu": 16950.41, "total_tokens": 573255271 }, { "epoch": 0.3629657414353588, "grad_norm": 0.8920392394065857, "learning_rate": 2e-05, "loss": 0.6791, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5806, "tokens_per_second_per_gpu": 18248.05, "total_tokens": 573358407 }, { "epoch": 0.36302825706426606, "grad_norm": 0.9295347332954407, "learning_rate": 2e-05, "loss": 0.6728, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5807, "tokens_per_second_per_gpu": 17730.97, "total_tokens": 573456891 }, { "epoch": 0.3630907726931733, "grad_norm": 0.8781394362449646, "learning_rate": 2e-05, "loss": 0.6346, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5808, "tokens_per_second_per_gpu": 16639.97, "total_tokens": 573553370 }, { "epoch": 0.36315328832208055, "grad_norm": 0.8782497644424438, "learning_rate": 2e-05, "loss": 0.6227, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5809, "tokens_per_second_per_gpu": 16043.34, "total_tokens": 573649887 }, { "epoch": 0.36321580395098774, "grad_norm": 0.8880986571311951, "learning_rate": 2e-05, "loss": 0.6319, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5810, "tokens_per_second_per_gpu": 16956.53, "total_tokens": 573750253 }, { "epoch": 0.363278319579895, "grad_norm": 0.8722561597824097, "learning_rate": 2e-05, "loss": 0.6363, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5811, "tokens_per_second_per_gpu": 16868.34, "total_tokens": 573848318 }, { "epoch": 0.3633408352088022, "grad_norm": 0.903502345085144, "learning_rate": 2e-05, "loss": 0.6005, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5812, "tokens_per_second_per_gpu": 15978.51, "total_tokens": 573943648 }, { "epoch": 0.3634033508377094, "grad_norm": 0.9476007223129272, "learning_rate": 2e-05, "loss": 0.6675, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5813, "tokens_per_second_per_gpu": 18548.36, "total_tokens": 574045159 }, { "epoch": 0.36346586646661666, "grad_norm": 0.9362157583236694, "learning_rate": 2e-05, "loss": 0.6629, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5814, "tokens_per_second_per_gpu": 16020.5, "total_tokens": 574143212 }, { "epoch": 0.3635283820955239, "grad_norm": 0.9079171419143677, "learning_rate": 2e-05, "loss": 0.6486, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5815, "tokens_per_second_per_gpu": 16692.67, "total_tokens": 574241925 }, { "epoch": 0.3635908977244311, "grad_norm": 0.8925111889839172, "learning_rate": 2e-05, "loss": 0.6589, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5816, "tokens_per_second_per_gpu": 16987.77, "total_tokens": 574340318 }, { "epoch": 0.36365341335333834, "grad_norm": 0.8901721835136414, "learning_rate": 2e-05, "loss": 0.6593, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5817, "tokens_per_second_per_gpu": 17998.35, "total_tokens": 574440655 }, { "epoch": 0.3637159289822456, "grad_norm": 0.9636625051498413, "learning_rate": 2e-05, "loss": 0.6553, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5818, "tokens_per_second_per_gpu": 17653.13, "total_tokens": 574536021 }, { "epoch": 0.3637784446111528, "grad_norm": 0.8734174370765686, "learning_rate": 2e-05, "loss": 0.6328, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5819, "tokens_per_second_per_gpu": 17010.12, "total_tokens": 574634793 }, { "epoch": 0.36384096024006, "grad_norm": 0.9331033825874329, "learning_rate": 2e-05, "loss": 0.6552, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5820, "tokens_per_second_per_gpu": 17591.3, "total_tokens": 574735034 }, { "epoch": 0.36390347586896726, "grad_norm": 0.9059635400772095, "learning_rate": 2e-05, "loss": 0.6913, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5821, "tokens_per_second_per_gpu": 16935.09, "total_tokens": 574833572 }, { "epoch": 0.36396599149787445, "grad_norm": 0.9422125816345215, "learning_rate": 2e-05, "loss": 0.6781, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5822, "tokens_per_second_per_gpu": 17125.06, "total_tokens": 574931416 }, { "epoch": 0.3640285071267817, "grad_norm": 0.8808863759040833, "learning_rate": 2e-05, "loss": 0.6275, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5823, "tokens_per_second_per_gpu": 17007.03, "total_tokens": 575027877 }, { "epoch": 0.36409102275568894, "grad_norm": 0.9095982313156128, "learning_rate": 2e-05, "loss": 0.6822, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5824, "tokens_per_second_per_gpu": 18330.01, "total_tokens": 575130244 }, { "epoch": 0.36415353838459613, "grad_norm": 0.8724250197410583, "learning_rate": 2e-05, "loss": 0.6612, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5825, "tokens_per_second_per_gpu": 18055.5, "total_tokens": 575233355 }, { "epoch": 0.3642160540135034, "grad_norm": 0.8971463441848755, "learning_rate": 2e-05, "loss": 0.7278, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5826, "tokens_per_second_per_gpu": 17674.94, "total_tokens": 575332318 }, { "epoch": 0.3642785696424106, "grad_norm": 0.9200931191444397, "learning_rate": 2e-05, "loss": 0.6722, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5827, "tokens_per_second_per_gpu": 17907.17, "total_tokens": 575432000 }, { "epoch": 0.3643410852713178, "grad_norm": 0.9439506530761719, "learning_rate": 2e-05, "loss": 0.7111, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5828, "tokens_per_second_per_gpu": 17868.27, "total_tokens": 575529872 }, { "epoch": 0.36440360090022506, "grad_norm": 0.9186313152313232, "learning_rate": 2e-05, "loss": 0.6405, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5829, "tokens_per_second_per_gpu": 16724.83, "total_tokens": 575626057 }, { "epoch": 0.3644661165291323, "grad_norm": 0.8855050802230835, "learning_rate": 2e-05, "loss": 0.6494, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5830, "tokens_per_second_per_gpu": 17308.89, "total_tokens": 575723502 }, { "epoch": 0.3645286321580395, "grad_norm": 0.9064972996711731, "learning_rate": 2e-05, "loss": 0.6659, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5831, "tokens_per_second_per_gpu": 17309.34, "total_tokens": 575822552 }, { "epoch": 0.36459114778694673, "grad_norm": 0.8534301519393921, "learning_rate": 2e-05, "loss": 0.6266, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5832, "tokens_per_second_per_gpu": 15901.16, "total_tokens": 575920672 }, { "epoch": 0.364653663415854, "grad_norm": 0.8995980620384216, "learning_rate": 2e-05, "loss": 0.6681, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5833, "tokens_per_second_per_gpu": 17860.69, "total_tokens": 576023722 }, { "epoch": 0.36471617904476117, "grad_norm": 0.9189600944519043, "learning_rate": 2e-05, "loss": 0.6805, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5834, "tokens_per_second_per_gpu": 18002.9, "total_tokens": 576123199 }, { "epoch": 0.3647786946736684, "grad_norm": 0.8957812786102295, "learning_rate": 2e-05, "loss": 0.6225, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5835, "tokens_per_second_per_gpu": 17842.35, "total_tokens": 576220420 }, { "epoch": 0.36484121030257566, "grad_norm": 0.8509117364883423, "learning_rate": 2e-05, "loss": 0.6475, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5836, "tokens_per_second_per_gpu": 17519.68, "total_tokens": 576321675 }, { "epoch": 0.36490372593148285, "grad_norm": 0.8819394707679749, "learning_rate": 2e-05, "loss": 0.6501, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5837, "tokens_per_second_per_gpu": 16293.92, "total_tokens": 576416743 }, { "epoch": 0.3649662415603901, "grad_norm": 0.8677046895027161, "learning_rate": 2e-05, "loss": 0.6122, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5838, "tokens_per_second_per_gpu": 16457.51, "total_tokens": 576513805 }, { "epoch": 0.36502875718929734, "grad_norm": 0.8956710696220398, "learning_rate": 2e-05, "loss": 0.6395, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5839, "tokens_per_second_per_gpu": 17077.56, "total_tokens": 576612113 }, { "epoch": 0.3650912728182045, "grad_norm": 0.8785106539726257, "learning_rate": 2e-05, "loss": 0.651, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5840, "tokens_per_second_per_gpu": 17276.75, "total_tokens": 576710850 }, { "epoch": 0.36515378844711177, "grad_norm": 0.9091425538063049, "learning_rate": 2e-05, "loss": 0.6928, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5841, "tokens_per_second_per_gpu": 17265.34, "total_tokens": 576810543 }, { "epoch": 0.365216304076019, "grad_norm": 0.9060136675834656, "learning_rate": 2e-05, "loss": 0.6808, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5842, "tokens_per_second_per_gpu": 18162.03, "total_tokens": 576913345 }, { "epoch": 0.3652788197049262, "grad_norm": 0.883476197719574, "learning_rate": 2e-05, "loss": 0.6329, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5843, "tokens_per_second_per_gpu": 17043.46, "total_tokens": 577012010 }, { "epoch": 0.36534133533383345, "grad_norm": 0.9089934229850769, "learning_rate": 2e-05, "loss": 0.6678, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5844, "tokens_per_second_per_gpu": 17979.08, "total_tokens": 577113815 }, { "epoch": 0.3654038509627407, "grad_norm": 0.9766348600387573, "learning_rate": 2e-05, "loss": 0.7354, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5845, "tokens_per_second_per_gpu": 17667.73, "total_tokens": 577212064 }, { "epoch": 0.3654663665916479, "grad_norm": 0.9445174336433411, "learning_rate": 2e-05, "loss": 0.7102, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5846, "tokens_per_second_per_gpu": 18604.4, "total_tokens": 577313883 }, { "epoch": 0.36552888222055513, "grad_norm": 0.8846721053123474, "learning_rate": 2e-05, "loss": 0.653, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5847, "tokens_per_second_per_gpu": 17342.81, "total_tokens": 577415264 }, { "epoch": 0.3655913978494624, "grad_norm": 0.9172195196151733, "learning_rate": 2e-05, "loss": 0.6498, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5848, "tokens_per_second_per_gpu": 17209.36, "total_tokens": 577512264 }, { "epoch": 0.3656539134783696, "grad_norm": 0.9005303382873535, "learning_rate": 2e-05, "loss": 0.6721, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5849, "tokens_per_second_per_gpu": 16817.52, "total_tokens": 577606284 }, { "epoch": 0.3657164291072768, "grad_norm": 0.9252036213874817, "learning_rate": 2e-05, "loss": 0.6414, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5850, "tokens_per_second_per_gpu": 16926.02, "total_tokens": 577703252 }, { "epoch": 0.36577894473618405, "grad_norm": 0.9067889451980591, "learning_rate": 2e-05, "loss": 0.6655, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5851, "tokens_per_second_per_gpu": 17170.51, "total_tokens": 577799914 }, { "epoch": 0.3658414603650913, "grad_norm": 0.9133926630020142, "learning_rate": 2e-05, "loss": 0.6413, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5852, "tokens_per_second_per_gpu": 16168.35, "total_tokens": 577892963 }, { "epoch": 0.3659039759939985, "grad_norm": 0.8772553205490112, "learning_rate": 2e-05, "loss": 0.6246, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5853, "tokens_per_second_per_gpu": 16528.96, "total_tokens": 577991016 }, { "epoch": 0.36596649162290573, "grad_norm": 0.9264699220657349, "learning_rate": 2e-05, "loss": 0.645, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5854, "tokens_per_second_per_gpu": 16890.59, "total_tokens": 578085046 }, { "epoch": 0.366029007251813, "grad_norm": 0.8797459602355957, "learning_rate": 2e-05, "loss": 0.6926, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5855, "tokens_per_second_per_gpu": 17056.11, "total_tokens": 578182662 }, { "epoch": 0.36609152288072017, "grad_norm": 0.8705192804336548, "learning_rate": 2e-05, "loss": 0.6576, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5856, "tokens_per_second_per_gpu": 17329.57, "total_tokens": 578280884 }, { "epoch": 0.3661540385096274, "grad_norm": 0.9024161100387573, "learning_rate": 2e-05, "loss": 0.6468, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5857, "tokens_per_second_per_gpu": 16391.0, "total_tokens": 578376167 }, { "epoch": 0.36621655413853466, "grad_norm": 0.899243175983429, "learning_rate": 2e-05, "loss": 0.7216, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5858, "tokens_per_second_per_gpu": 18107.55, "total_tokens": 578476151 }, { "epoch": 0.36627906976744184, "grad_norm": 0.9190258383750916, "learning_rate": 2e-05, "loss": 0.653, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5859, "tokens_per_second_per_gpu": 16875.32, "total_tokens": 578572401 }, { "epoch": 0.3663415853963491, "grad_norm": 0.9063765406608582, "learning_rate": 2e-05, "loss": 0.6675, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5860, "tokens_per_second_per_gpu": 17754.72, "total_tokens": 578670905 }, { "epoch": 0.36640410102525633, "grad_norm": 0.9060600399971008, "learning_rate": 2e-05, "loss": 0.6505, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5861, "tokens_per_second_per_gpu": 18400.43, "total_tokens": 578771011 }, { "epoch": 0.3664666166541635, "grad_norm": 0.8965702056884766, "learning_rate": 2e-05, "loss": 0.7232, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5862, "tokens_per_second_per_gpu": 18163.82, "total_tokens": 578874298 }, { "epoch": 0.36652913228307077, "grad_norm": 0.9057731032371521, "learning_rate": 2e-05, "loss": 0.642, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5863, "tokens_per_second_per_gpu": 17176.62, "total_tokens": 578972649 }, { "epoch": 0.366591647911978, "grad_norm": 0.8830036520957947, "learning_rate": 2e-05, "loss": 0.6597, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5864, "tokens_per_second_per_gpu": 17826.9, "total_tokens": 579069900 }, { "epoch": 0.3666541635408852, "grad_norm": 0.9495677351951599, "learning_rate": 2e-05, "loss": 0.6471, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5865, "tokens_per_second_per_gpu": 17947.99, "total_tokens": 579171110 }, { "epoch": 0.36671667916979245, "grad_norm": 0.8716365694999695, "learning_rate": 2e-05, "loss": 0.6328, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5866, "tokens_per_second_per_gpu": 18125.22, "total_tokens": 579271553 }, { "epoch": 0.3667791947986997, "grad_norm": 0.8804631233215332, "learning_rate": 2e-05, "loss": 0.6601, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5867, "tokens_per_second_per_gpu": 17761.53, "total_tokens": 579371280 }, { "epoch": 0.3668417104276069, "grad_norm": 0.9187114834785461, "learning_rate": 2e-05, "loss": 0.6808, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5868, "tokens_per_second_per_gpu": 17361.28, "total_tokens": 579471118 }, { "epoch": 0.3669042260565141, "grad_norm": 0.8738965392112732, "learning_rate": 2e-05, "loss": 0.7065, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5869, "tokens_per_second_per_gpu": 18203.63, "total_tokens": 579573431 }, { "epoch": 0.36696674168542137, "grad_norm": 0.8577467203140259, "learning_rate": 2e-05, "loss": 0.6304, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5870, "tokens_per_second_per_gpu": 17543.05, "total_tokens": 579670339 }, { "epoch": 0.36702925731432856, "grad_norm": 0.8935378789901733, "learning_rate": 2e-05, "loss": 0.661, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5871, "tokens_per_second_per_gpu": 17437.61, "total_tokens": 579770431 }, { "epoch": 0.3670917729432358, "grad_norm": 0.912763774394989, "learning_rate": 2e-05, "loss": 0.674, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5872, "tokens_per_second_per_gpu": 16648.95, "total_tokens": 579868197 }, { "epoch": 0.36715428857214305, "grad_norm": 0.8901075720787048, "learning_rate": 2e-05, "loss": 0.6529, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5873, "tokens_per_second_per_gpu": 18072.78, "total_tokens": 579966208 }, { "epoch": 0.36721680420105024, "grad_norm": 0.8633251786231995, "learning_rate": 2e-05, "loss": 0.6401, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5874, "tokens_per_second_per_gpu": 17523.15, "total_tokens": 580064325 }, { "epoch": 0.3672793198299575, "grad_norm": 0.8994544744491577, "learning_rate": 2e-05, "loss": 0.6434, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5875, "tokens_per_second_per_gpu": 17458.16, "total_tokens": 580164902 }, { "epoch": 0.36734183545886473, "grad_norm": 0.9235512614250183, "learning_rate": 2e-05, "loss": 0.6386, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5876, "tokens_per_second_per_gpu": 16785.49, "total_tokens": 580260823 }, { "epoch": 0.3674043510877719, "grad_norm": 0.8871520757675171, "learning_rate": 2e-05, "loss": 0.6974, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5877, "tokens_per_second_per_gpu": 17681.72, "total_tokens": 580361683 }, { "epoch": 0.36746686671667916, "grad_norm": 0.8961672186851501, "learning_rate": 2e-05, "loss": 0.611, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5878, "tokens_per_second_per_gpu": 16724.55, "total_tokens": 580457844 }, { "epoch": 0.3675293823455864, "grad_norm": 0.9068059325218201, "learning_rate": 2e-05, "loss": 0.6944, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5879, "tokens_per_second_per_gpu": 18302.77, "total_tokens": 580560652 }, { "epoch": 0.3675918979744936, "grad_norm": 0.903887927532196, "learning_rate": 2e-05, "loss": 0.709, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5880, "tokens_per_second_per_gpu": 16973.76, "total_tokens": 580662079 }, { "epoch": 0.36765441360340084, "grad_norm": 0.9371901750564575, "learning_rate": 2e-05, "loss": 0.6552, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5881, "tokens_per_second_per_gpu": 17505.83, "total_tokens": 580765031 }, { "epoch": 0.3677169292323081, "grad_norm": 0.8936437368392944, "learning_rate": 2e-05, "loss": 0.6521, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5882, "tokens_per_second_per_gpu": 17034.13, "total_tokens": 580866547 }, { "epoch": 0.3677794448612153, "grad_norm": 0.8692519068717957, "learning_rate": 2e-05, "loss": 0.641, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5883, "tokens_per_second_per_gpu": 17880.81, "total_tokens": 580967635 }, { "epoch": 0.3678419604901225, "grad_norm": 0.9044257402420044, "learning_rate": 2e-05, "loss": 0.6887, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5884, "tokens_per_second_per_gpu": 18062.86, "total_tokens": 581067798 }, { "epoch": 0.36790447611902977, "grad_norm": 0.8882101774215698, "learning_rate": 2e-05, "loss": 0.6272, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5885, "tokens_per_second_per_gpu": 16807.92, "total_tokens": 581164254 }, { "epoch": 0.367966991747937, "grad_norm": 0.8908026218414307, "learning_rate": 2e-05, "loss": 0.7058, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5886, "tokens_per_second_per_gpu": 18023.11, "total_tokens": 581266575 }, { "epoch": 0.3680295073768442, "grad_norm": 0.8404673337936401, "learning_rate": 2e-05, "loss": 0.5845, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5887, "tokens_per_second_per_gpu": 16008.29, "total_tokens": 581362127 }, { "epoch": 0.36809202300575145, "grad_norm": 0.8663082122802734, "learning_rate": 2e-05, "loss": 0.6525, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5888, "tokens_per_second_per_gpu": 17393.86, "total_tokens": 581462914 }, { "epoch": 0.3681545386346587, "grad_norm": 0.883434534072876, "learning_rate": 2e-05, "loss": 0.6824, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5889, "tokens_per_second_per_gpu": 18092.37, "total_tokens": 581565578 }, { "epoch": 0.3682170542635659, "grad_norm": 0.8910824060440063, "learning_rate": 2e-05, "loss": 0.6761, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5890, "tokens_per_second_per_gpu": 16455.17, "total_tokens": 581663067 }, { "epoch": 0.3682795698924731, "grad_norm": 0.8485224843025208, "learning_rate": 2e-05, "loss": 0.6525, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5891, "tokens_per_second_per_gpu": 18799.12, "total_tokens": 581764731 }, { "epoch": 0.36834208552138037, "grad_norm": 0.8699290752410889, "learning_rate": 2e-05, "loss": 0.6972, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5892, "tokens_per_second_per_gpu": 18162.81, "total_tokens": 581867967 }, { "epoch": 0.36840460115028756, "grad_norm": 0.8848530650138855, "learning_rate": 2e-05, "loss": 0.6658, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5893, "tokens_per_second_per_gpu": 17846.92, "total_tokens": 581967912 }, { "epoch": 0.3684671167791948, "grad_norm": 0.8823222517967224, "learning_rate": 2e-05, "loss": 0.6723, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5894, "tokens_per_second_per_gpu": 17364.26, "total_tokens": 582066559 }, { "epoch": 0.36852963240810205, "grad_norm": 0.8793716430664062, "learning_rate": 2e-05, "loss": 0.6599, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5895, "tokens_per_second_per_gpu": 17726.07, "total_tokens": 582170559 }, { "epoch": 0.36859214803700924, "grad_norm": 0.9195157885551453, "learning_rate": 2e-05, "loss": 0.6689, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5896, "tokens_per_second_per_gpu": 16894.01, "total_tokens": 582267406 }, { "epoch": 0.3686546636659165, "grad_norm": 0.905224621295929, "learning_rate": 2e-05, "loss": 0.671, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5897, "tokens_per_second_per_gpu": 18238.4, "total_tokens": 582367165 }, { "epoch": 0.3687171792948237, "grad_norm": 0.8961045145988464, "learning_rate": 2e-05, "loss": 0.6546, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5898, "tokens_per_second_per_gpu": 17002.95, "total_tokens": 582464398 }, { "epoch": 0.3687796949237309, "grad_norm": 0.86179119348526, "learning_rate": 2e-05, "loss": 0.6586, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5899, "tokens_per_second_per_gpu": 17781.18, "total_tokens": 582565435 }, { "epoch": 0.36884221055263816, "grad_norm": 0.9125213027000427, "learning_rate": 2e-05, "loss": 0.6844, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5900, "tokens_per_second_per_gpu": 17661.8, "total_tokens": 582665236 }, { "epoch": 0.3689047261815454, "grad_norm": 0.8507044315338135, "learning_rate": 2e-05, "loss": 0.6714, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5901, "tokens_per_second_per_gpu": 18264.2, "total_tokens": 582765881 }, { "epoch": 0.3689672418104526, "grad_norm": 0.8883525133132935, "learning_rate": 2e-05, "loss": 0.6319, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5902, "tokens_per_second_per_gpu": 17086.2, "total_tokens": 582866685 }, { "epoch": 0.36902975743935984, "grad_norm": 0.926533043384552, "learning_rate": 2e-05, "loss": 0.6429, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5903, "tokens_per_second_per_gpu": 16895.87, "total_tokens": 582962554 }, { "epoch": 0.3690922730682671, "grad_norm": 0.957968533039093, "learning_rate": 2e-05, "loss": 0.6918, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5904, "tokens_per_second_per_gpu": 17012.58, "total_tokens": 583058472 }, { "epoch": 0.3691547886971743, "grad_norm": 0.8796818852424622, "learning_rate": 2e-05, "loss": 0.7212, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5905, "tokens_per_second_per_gpu": 18034.1, "total_tokens": 583162631 }, { "epoch": 0.3692173043260815, "grad_norm": 0.9072388410568237, "learning_rate": 2e-05, "loss": 0.7073, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5906, "tokens_per_second_per_gpu": 17717.4, "total_tokens": 583263638 }, { "epoch": 0.36927981995498876, "grad_norm": 0.9260995388031006, "learning_rate": 2e-05, "loss": 0.6805, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5907, "tokens_per_second_per_gpu": 17070.39, "total_tokens": 583360763 }, { "epoch": 0.36934233558389595, "grad_norm": 0.909234344959259, "learning_rate": 2e-05, "loss": 0.6143, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5908, "tokens_per_second_per_gpu": 15726.01, "total_tokens": 583454359 }, { "epoch": 0.3694048512128032, "grad_norm": 0.9369661808013916, "learning_rate": 2e-05, "loss": 0.6277, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5909, "tokens_per_second_per_gpu": 16341.97, "total_tokens": 583540930 }, { "epoch": 0.36946736684171044, "grad_norm": 0.8598852753639221, "learning_rate": 2e-05, "loss": 0.6799, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5910, "tokens_per_second_per_gpu": 17268.66, "total_tokens": 583643977 }, { "epoch": 0.36952988247061763, "grad_norm": 0.9046366810798645, "learning_rate": 2e-05, "loss": 0.6547, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5911, "tokens_per_second_per_gpu": 18136.82, "total_tokens": 583744168 }, { "epoch": 0.3695923980995249, "grad_norm": 0.8784425854682922, "learning_rate": 2e-05, "loss": 0.6196, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5912, "tokens_per_second_per_gpu": 17254.19, "total_tokens": 583842796 }, { "epoch": 0.3696549137284321, "grad_norm": 0.8586320877075195, "learning_rate": 2e-05, "loss": 0.6678, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5913, "tokens_per_second_per_gpu": 18367.24, "total_tokens": 583947962 }, { "epoch": 0.3697174293573393, "grad_norm": 0.8860555291175842, "learning_rate": 2e-05, "loss": 0.6301, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5914, "tokens_per_second_per_gpu": 17160.92, "total_tokens": 584046054 }, { "epoch": 0.36977994498624656, "grad_norm": 0.9361036419868469, "learning_rate": 2e-05, "loss": 0.6831, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5915, "tokens_per_second_per_gpu": 15835.57, "total_tokens": 584143126 }, { "epoch": 0.3698424606151538, "grad_norm": 0.9030418395996094, "learning_rate": 2e-05, "loss": 0.67, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5916, "tokens_per_second_per_gpu": 19057.17, "total_tokens": 584247330 }, { "epoch": 0.369904976244061, "grad_norm": 0.9448500871658325, "learning_rate": 2e-05, "loss": 0.6628, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5917, "tokens_per_second_per_gpu": 15743.74, "total_tokens": 584334562 }, { "epoch": 0.36996749187296823, "grad_norm": 0.8943078517913818, "learning_rate": 2e-05, "loss": 0.6835, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5918, "tokens_per_second_per_gpu": 17992.62, "total_tokens": 584434735 }, { "epoch": 0.3700300075018755, "grad_norm": 0.9050602912902832, "learning_rate": 2e-05, "loss": 0.6708, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5919, "tokens_per_second_per_gpu": 17118.27, "total_tokens": 584532745 }, { "epoch": 0.37009252313078267, "grad_norm": 0.8383467793464661, "learning_rate": 2e-05, "loss": 0.64, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5920, "tokens_per_second_per_gpu": 19129.07, "total_tokens": 584634767 }, { "epoch": 0.3701550387596899, "grad_norm": 0.8909660577774048, "learning_rate": 2e-05, "loss": 0.6413, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5921, "tokens_per_second_per_gpu": 18162.5, "total_tokens": 584735329 }, { "epoch": 0.37021755438859716, "grad_norm": 0.9163928627967834, "learning_rate": 2e-05, "loss": 0.7039, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5922, "tokens_per_second_per_gpu": 18212.92, "total_tokens": 584839055 }, { "epoch": 0.3702800700175044, "grad_norm": 0.8580407500267029, "learning_rate": 2e-05, "loss": 0.691, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5923, "tokens_per_second_per_gpu": 17470.56, "total_tokens": 584940257 }, { "epoch": 0.3703425856464116, "grad_norm": 0.8724122047424316, "learning_rate": 2e-05, "loss": 0.6718, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5924, "tokens_per_second_per_gpu": 17849.15, "total_tokens": 585044366 }, { "epoch": 0.37040510127531884, "grad_norm": 0.8588433861732483, "learning_rate": 2e-05, "loss": 0.6182, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5925, "tokens_per_second_per_gpu": 16898.32, "total_tokens": 585144220 }, { "epoch": 0.3704676169042261, "grad_norm": 0.9120364785194397, "learning_rate": 2e-05, "loss": 0.6764, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5926, "tokens_per_second_per_gpu": 17667.81, "total_tokens": 585242808 }, { "epoch": 0.37053013253313327, "grad_norm": 0.9291548132896423, "learning_rate": 2e-05, "loss": 0.6681, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5927, "tokens_per_second_per_gpu": 18542.58, "total_tokens": 585343416 }, { "epoch": 0.3705926481620405, "grad_norm": 0.8932057023048401, "learning_rate": 2e-05, "loss": 0.677, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5928, "tokens_per_second_per_gpu": 17564.41, "total_tokens": 585445219 }, { "epoch": 0.37065516379094776, "grad_norm": 0.8858460187911987, "learning_rate": 2e-05, "loss": 0.6625, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5929, "tokens_per_second_per_gpu": 16159.58, "total_tokens": 585541856 }, { "epoch": 0.37071767941985495, "grad_norm": 0.8972607851028442, "learning_rate": 2e-05, "loss": 0.6525, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5930, "tokens_per_second_per_gpu": 18244.41, "total_tokens": 585640148 }, { "epoch": 0.3707801950487622, "grad_norm": 0.8527999520301819, "learning_rate": 2e-05, "loss": 0.6559, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5931, "tokens_per_second_per_gpu": 17319.4, "total_tokens": 585738620 }, { "epoch": 0.37084271067766944, "grad_norm": 0.8706446290016174, "learning_rate": 2e-05, "loss": 0.6278, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5932, "tokens_per_second_per_gpu": 18467.42, "total_tokens": 585838331 }, { "epoch": 0.37090522630657663, "grad_norm": 0.9039784669876099, "learning_rate": 2e-05, "loss": 0.6681, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5933, "tokens_per_second_per_gpu": 18308.82, "total_tokens": 585938362 }, { "epoch": 0.3709677419354839, "grad_norm": 0.8831605315208435, "learning_rate": 2e-05, "loss": 0.6734, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5934, "tokens_per_second_per_gpu": 17753.47, "total_tokens": 586041173 }, { "epoch": 0.3710302575643911, "grad_norm": 0.9085513353347778, "learning_rate": 2e-05, "loss": 0.6634, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5935, "tokens_per_second_per_gpu": 17628.75, "total_tokens": 586141762 }, { "epoch": 0.3710927731932983, "grad_norm": 0.9096230268478394, "learning_rate": 2e-05, "loss": 0.6465, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5936, "tokens_per_second_per_gpu": 16838.67, "total_tokens": 586238162 }, { "epoch": 0.37115528882220555, "grad_norm": 0.8840309977531433, "learning_rate": 2e-05, "loss": 0.6659, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5937, "tokens_per_second_per_gpu": 16388.82, "total_tokens": 586337622 }, { "epoch": 0.3712178044511128, "grad_norm": 0.8606729507446289, "learning_rate": 2e-05, "loss": 0.6429, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5938, "tokens_per_second_per_gpu": 14206.08, "total_tokens": 586435442 }, { "epoch": 0.37128032008002, "grad_norm": 0.8692227602005005, "learning_rate": 2e-05, "loss": 0.6628, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5939, "tokens_per_second_per_gpu": 16579.17, "total_tokens": 586535502 }, { "epoch": 0.37134283570892723, "grad_norm": 0.9128494262695312, "learning_rate": 2e-05, "loss": 0.6753, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5940, "tokens_per_second_per_gpu": 16174.74, "total_tokens": 586636744 }, { "epoch": 0.3714053513378345, "grad_norm": 0.9200928211212158, "learning_rate": 2e-05, "loss": 0.6761, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5941, "tokens_per_second_per_gpu": 18554.15, "total_tokens": 586736672 }, { "epoch": 0.37146786696674167, "grad_norm": 0.9572598338127136, "learning_rate": 2e-05, "loss": 0.6698, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5942, "tokens_per_second_per_gpu": 16708.92, "total_tokens": 586838284 }, { "epoch": 0.3715303825956489, "grad_norm": 0.9766356348991394, "learning_rate": 2e-05, "loss": 0.6554, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5943, "tokens_per_second_per_gpu": 15404.29, "total_tokens": 586937620 }, { "epoch": 0.37159289822455616, "grad_norm": 0.9133448600769043, "learning_rate": 2e-05, "loss": 0.6994, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5944, "tokens_per_second_per_gpu": 11100.55, "total_tokens": 587039633 }, { "epoch": 0.37165541385346335, "grad_norm": 0.8821882605552673, "learning_rate": 2e-05, "loss": 0.668, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5945, "tokens_per_second_per_gpu": 10568.93, "total_tokens": 587139250 }, { "epoch": 0.3717179294823706, "grad_norm": 0.8611649870872498, "learning_rate": 2e-05, "loss": 0.6616, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5946, "tokens_per_second_per_gpu": 10855.54, "total_tokens": 587240523 }, { "epoch": 0.37178044511127784, "grad_norm": 0.894584596157074, "learning_rate": 2e-05, "loss": 0.6696, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5947, "tokens_per_second_per_gpu": 10175.87, "total_tokens": 587337601 }, { "epoch": 0.371842960740185, "grad_norm": 0.8869580626487732, "learning_rate": 2e-05, "loss": 0.6578, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5948, "tokens_per_second_per_gpu": 11128.79, "total_tokens": 587441288 }, { "epoch": 0.37190547636909227, "grad_norm": 0.889805018901825, "learning_rate": 2e-05, "loss": 0.6468, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5949, "tokens_per_second_per_gpu": 10123.56, "total_tokens": 587536089 }, { "epoch": 0.3719679919979995, "grad_norm": 0.9076393246650696, "learning_rate": 2e-05, "loss": 0.6494, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5950, "tokens_per_second_per_gpu": 10476.16, "total_tokens": 587634845 }, { "epoch": 0.3720305076269067, "grad_norm": 0.8806503415107727, "learning_rate": 2e-05, "loss": 0.6782, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5951, "tokens_per_second_per_gpu": 10583.9, "total_tokens": 587734818 }, { "epoch": 0.37209302325581395, "grad_norm": 0.9038432836532593, "learning_rate": 2e-05, "loss": 0.669, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5952, "tokens_per_second_per_gpu": 10201.22, "total_tokens": 587830522 }, { "epoch": 0.3721555388847212, "grad_norm": 0.8930050730705261, "learning_rate": 2e-05, "loss": 0.705, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5953, "tokens_per_second_per_gpu": 10707.94, "total_tokens": 587931185 }, { "epoch": 0.3722180545136284, "grad_norm": 0.9051708579063416, "learning_rate": 2e-05, "loss": 0.683, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5954, "tokens_per_second_per_gpu": 10538.18, "total_tokens": 588030172 }, { "epoch": 0.3722805701425356, "grad_norm": 0.8954328894615173, "learning_rate": 2e-05, "loss": 0.6711, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5955, "tokens_per_second_per_gpu": 11113.24, "total_tokens": 588130860 }, { "epoch": 0.37234308577144287, "grad_norm": 0.9118967056274414, "learning_rate": 2e-05, "loss": 0.6533, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5956, "tokens_per_second_per_gpu": 10364.29, "total_tokens": 588227935 }, { "epoch": 0.37240560140035006, "grad_norm": 0.8672183156013489, "learning_rate": 2e-05, "loss": 0.6214, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5957, "tokens_per_second_per_gpu": 11200.74, "total_tokens": 588322283 }, { "epoch": 0.3724681170292573, "grad_norm": 0.9177748560905457, "learning_rate": 2e-05, "loss": 0.6354, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5958, "tokens_per_second_per_gpu": 9880.31, "total_tokens": 588421642 }, { "epoch": 0.37253063265816455, "grad_norm": 0.917526125907898, "learning_rate": 2e-05, "loss": 0.647, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5959, "tokens_per_second_per_gpu": 10415.23, "total_tokens": 588514506 }, { "epoch": 0.37259314828707174, "grad_norm": 0.87420654296875, "learning_rate": 2e-05, "loss": 0.6711, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5960, "tokens_per_second_per_gpu": 9959.16, "total_tokens": 588614757 }, { "epoch": 0.372655663915979, "grad_norm": 0.8690875172615051, "learning_rate": 2e-05, "loss": 0.6099, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5961, "tokens_per_second_per_gpu": 10538.7, "total_tokens": 588712181 }, { "epoch": 0.37271817954488623, "grad_norm": 0.9186732769012451, "learning_rate": 2e-05, "loss": 0.7222, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5962, "tokens_per_second_per_gpu": 11067.38, "total_tokens": 588812433 }, { "epoch": 0.3727806951737935, "grad_norm": 0.8400458097457886, "learning_rate": 2e-05, "loss": 0.606, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5963, "tokens_per_second_per_gpu": 11231.39, "total_tokens": 588913133 }, { "epoch": 0.37284321080270066, "grad_norm": 0.8806619644165039, "learning_rate": 2e-05, "loss": 0.6635, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5964, "tokens_per_second_per_gpu": 10361.05, "total_tokens": 589014443 }, { "epoch": 0.3729057264316079, "grad_norm": 0.9195500612258911, "learning_rate": 2e-05, "loss": 0.6476, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5965, "tokens_per_second_per_gpu": 9911.96, "total_tokens": 589110790 }, { "epoch": 0.37296824206051515, "grad_norm": 0.9448845982551575, "learning_rate": 2e-05, "loss": 0.6777, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5966, "tokens_per_second_per_gpu": 10015.99, "total_tokens": 589205880 }, { "epoch": 0.37303075768942234, "grad_norm": 0.9011467099189758, "learning_rate": 2e-05, "loss": 0.6476, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5967, "tokens_per_second_per_gpu": 9871.55, "total_tokens": 589303231 }, { "epoch": 0.3730932733183296, "grad_norm": 0.9315033555030823, "learning_rate": 2e-05, "loss": 0.6979, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5968, "tokens_per_second_per_gpu": 10071.31, "total_tokens": 589400062 }, { "epoch": 0.37315578894723683, "grad_norm": 0.905916154384613, "learning_rate": 2e-05, "loss": 0.665, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5969, "tokens_per_second_per_gpu": 11209.0, "total_tokens": 589502214 }, { "epoch": 0.373218304576144, "grad_norm": 0.9118495583534241, "learning_rate": 2e-05, "loss": 0.6551, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5970, "tokens_per_second_per_gpu": 11437.48, "total_tokens": 589602876 }, { "epoch": 0.37328082020505127, "grad_norm": 0.875739336013794, "learning_rate": 2e-05, "loss": 0.6689, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5971, "tokens_per_second_per_gpu": 10821.59, "total_tokens": 589699992 }, { "epoch": 0.3733433358339585, "grad_norm": 0.9359530210494995, "learning_rate": 2e-05, "loss": 0.6694, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5972, "tokens_per_second_per_gpu": 10957.42, "total_tokens": 589800698 }, { "epoch": 0.3734058514628657, "grad_norm": 0.9088734984397888, "learning_rate": 2e-05, "loss": 0.6378, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5973, "tokens_per_second_per_gpu": 10479.77, "total_tokens": 589895822 }, { "epoch": 0.37346836709177295, "grad_norm": 0.969357430934906, "learning_rate": 2e-05, "loss": 0.6509, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5974, "tokens_per_second_per_gpu": 10666.06, "total_tokens": 589996415 }, { "epoch": 0.3735308827206802, "grad_norm": 0.8922457098960876, "learning_rate": 2e-05, "loss": 0.6469, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5975, "tokens_per_second_per_gpu": 10539.97, "total_tokens": 590091631 }, { "epoch": 0.3735933983495874, "grad_norm": 1.045122742652893, "learning_rate": 2e-05, "loss": 0.6715, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5976, "tokens_per_second_per_gpu": 11566.14, "total_tokens": 590195028 }, { "epoch": 0.3736559139784946, "grad_norm": 0.9000153541564941, "learning_rate": 2e-05, "loss": 0.6781, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5977, "tokens_per_second_per_gpu": 10764.91, "total_tokens": 590296252 }, { "epoch": 0.37371842960740187, "grad_norm": 0.8904604911804199, "learning_rate": 2e-05, "loss": 0.7032, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5978, "tokens_per_second_per_gpu": 11645.99, "total_tokens": 590401035 }, { "epoch": 0.37378094523630906, "grad_norm": 0.9092524647712708, "learning_rate": 2e-05, "loss": 0.6075, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5979, "tokens_per_second_per_gpu": 10041.09, "total_tokens": 590492272 }, { "epoch": 0.3738434608652163, "grad_norm": 0.8771396279335022, "learning_rate": 2e-05, "loss": 0.6414, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5980, "tokens_per_second_per_gpu": 9640.62, "total_tokens": 590590145 }, { "epoch": 0.37390597649412355, "grad_norm": 0.9147659540176392, "learning_rate": 2e-05, "loss": 0.6425, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5981, "tokens_per_second_per_gpu": 10773.32, "total_tokens": 590689592 }, { "epoch": 0.37396849212303074, "grad_norm": 0.9001796245574951, "learning_rate": 2e-05, "loss": 0.6855, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5982, "tokens_per_second_per_gpu": 10786.53, "total_tokens": 590792208 }, { "epoch": 0.374031007751938, "grad_norm": 0.8895965814590454, "learning_rate": 2e-05, "loss": 0.6737, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5983, "tokens_per_second_per_gpu": 10934.67, "total_tokens": 590891202 }, { "epoch": 0.3740935233808452, "grad_norm": 0.8988028168678284, "learning_rate": 2e-05, "loss": 0.689, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5984, "tokens_per_second_per_gpu": 11098.01, "total_tokens": 590990434 }, { "epoch": 0.3741560390097524, "grad_norm": 0.9671992063522339, "learning_rate": 2e-05, "loss": 0.677, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5985, "tokens_per_second_per_gpu": 10440.17, "total_tokens": 591091700 }, { "epoch": 0.37421855463865966, "grad_norm": 0.9083348512649536, "learning_rate": 2e-05, "loss": 0.661, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5986, "tokens_per_second_per_gpu": 10117.18, "total_tokens": 591190616 }, { "epoch": 0.3742810702675669, "grad_norm": 0.955725908279419, "learning_rate": 2e-05, "loss": 0.6595, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5987, "tokens_per_second_per_gpu": 10947.21, "total_tokens": 591289542 }, { "epoch": 0.3743435858964741, "grad_norm": 0.8649488687515259, "learning_rate": 2e-05, "loss": 0.622, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5988, "tokens_per_second_per_gpu": 10718.39, "total_tokens": 591389723 }, { "epoch": 0.37440610152538134, "grad_norm": 0.9019325375556946, "learning_rate": 2e-05, "loss": 0.6691, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5989, "tokens_per_second_per_gpu": 11280.11, "total_tokens": 591491684 }, { "epoch": 0.3744686171542886, "grad_norm": 0.9350320100784302, "learning_rate": 2e-05, "loss": 0.6651, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5990, "tokens_per_second_per_gpu": 10757.39, "total_tokens": 591594044 }, { "epoch": 0.3745311327831958, "grad_norm": 0.9561691880226135, "learning_rate": 2e-05, "loss": 0.6472, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5991, "tokens_per_second_per_gpu": 10384.11, "total_tokens": 591688353 }, { "epoch": 0.374593648412103, "grad_norm": 0.8911367654800415, "learning_rate": 2e-05, "loss": 0.6655, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5992, "tokens_per_second_per_gpu": 10863.65, "total_tokens": 591787298 }, { "epoch": 0.37465616404101026, "grad_norm": 0.8818714022636414, "learning_rate": 2e-05, "loss": 0.6701, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5993, "tokens_per_second_per_gpu": 10853.25, "total_tokens": 591888671 }, { "epoch": 0.37471867966991745, "grad_norm": 0.9108416438102722, "learning_rate": 2e-05, "loss": 0.6553, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5994, "tokens_per_second_per_gpu": 11127.72, "total_tokens": 591993570 }, { "epoch": 0.3747811952988247, "grad_norm": 0.9317836165428162, "learning_rate": 2e-05, "loss": 0.6443, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5995, "tokens_per_second_per_gpu": 11120.82, "total_tokens": 592093073 }, { "epoch": 0.37484371092773194, "grad_norm": 0.8889514207839966, "learning_rate": 2e-05, "loss": 0.6728, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5996, "tokens_per_second_per_gpu": 11287.27, "total_tokens": 592197671 }, { "epoch": 0.37490622655663913, "grad_norm": 0.9380969405174255, "learning_rate": 2e-05, "loss": 0.6976, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5997, "tokens_per_second_per_gpu": 11189.89, "total_tokens": 592296203 }, { "epoch": 0.3749687421855464, "grad_norm": 0.9303845167160034, "learning_rate": 2e-05, "loss": 0.6589, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5998, "tokens_per_second_per_gpu": 10299.22, "total_tokens": 592394655 }, { "epoch": 0.3750312578144536, "grad_norm": 0.8428171277046204, "learning_rate": 2e-05, "loss": 0.6339, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 5999, "tokens_per_second_per_gpu": 10671.04, "total_tokens": 592495465 }, { "epoch": 0.37509377344336087, "grad_norm": 0.9071078896522522, "learning_rate": 2e-05, "loss": 0.6655, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6000, "tokens_per_second_per_gpu": 10072.55, "total_tokens": 592595032 }, { "epoch": 0.37515628907226806, "grad_norm": 0.8878654837608337, "learning_rate": 2e-05, "loss": 0.684, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6001, "tokens_per_second_per_gpu": 11588.76, "total_tokens": 592699333 }, { "epoch": 0.3752188047011753, "grad_norm": 0.8904155492782593, "learning_rate": 2e-05, "loss": 0.6176, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6002, "tokens_per_second_per_gpu": 10325.28, "total_tokens": 592799046 }, { "epoch": 0.37528132033008255, "grad_norm": 0.8838923573493958, "learning_rate": 2e-05, "loss": 0.6665, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6003, "tokens_per_second_per_gpu": 11472.83, "total_tokens": 592900826 }, { "epoch": 0.37534383595898974, "grad_norm": 0.8759785294532776, "learning_rate": 2e-05, "loss": 0.6252, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6004, "tokens_per_second_per_gpu": 11492.22, "total_tokens": 593003142 }, { "epoch": 0.375406351587897, "grad_norm": 0.8979405760765076, "learning_rate": 2e-05, "loss": 0.6656, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6005, "tokens_per_second_per_gpu": 10383.32, "total_tokens": 593100398 }, { "epoch": 0.3754688672168042, "grad_norm": 0.8737428784370422, "learning_rate": 2e-05, "loss": 0.6231, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6006, "tokens_per_second_per_gpu": 10030.85, "total_tokens": 593193004 }, { "epoch": 0.3755313828457114, "grad_norm": 0.8862380981445312, "learning_rate": 2e-05, "loss": 0.6676, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6007, "tokens_per_second_per_gpu": 11127.87, "total_tokens": 593291709 }, { "epoch": 0.37559389847461866, "grad_norm": 0.8818439841270447, "learning_rate": 2e-05, "loss": 0.6372, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6008, "tokens_per_second_per_gpu": 9988.87, "total_tokens": 593389850 }, { "epoch": 0.3756564141035259, "grad_norm": 0.8787375092506409, "learning_rate": 2e-05, "loss": 0.6424, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6009, "tokens_per_second_per_gpu": 10290.98, "total_tokens": 593485220 }, { "epoch": 0.3757189297324331, "grad_norm": 0.892300546169281, "learning_rate": 2e-05, "loss": 0.6407, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6010, "tokens_per_second_per_gpu": 9916.08, "total_tokens": 593581988 }, { "epoch": 0.37578144536134034, "grad_norm": 0.8932536244392395, "learning_rate": 2e-05, "loss": 0.6692, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6011, "tokens_per_second_per_gpu": 10644.45, "total_tokens": 593682698 }, { "epoch": 0.3758439609902476, "grad_norm": 0.8579562902450562, "learning_rate": 2e-05, "loss": 0.6449, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6012, "tokens_per_second_per_gpu": 10217.96, "total_tokens": 593779828 }, { "epoch": 0.3759064766191548, "grad_norm": 0.9031206965446472, "learning_rate": 2e-05, "loss": 0.6835, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6013, "tokens_per_second_per_gpu": 10532.74, "total_tokens": 593881632 }, { "epoch": 0.375968992248062, "grad_norm": 0.9220521450042725, "learning_rate": 2e-05, "loss": 0.6785, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6014, "tokens_per_second_per_gpu": 11093.61, "total_tokens": 593981946 }, { "epoch": 0.37603150787696926, "grad_norm": 0.9461113214492798, "learning_rate": 2e-05, "loss": 0.6943, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6015, "tokens_per_second_per_gpu": 10874.41, "total_tokens": 594082386 }, { "epoch": 0.37609402350587645, "grad_norm": 0.9612008333206177, "learning_rate": 2e-05, "loss": 0.7104, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6016, "tokens_per_second_per_gpu": 10599.9, "total_tokens": 594181663 }, { "epoch": 0.3761565391347837, "grad_norm": 0.8983285427093506, "learning_rate": 2e-05, "loss": 0.688, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6017, "tokens_per_second_per_gpu": 10690.64, "total_tokens": 594281991 }, { "epoch": 0.37621905476369094, "grad_norm": 0.9034252762794495, "learning_rate": 2e-05, "loss": 0.722, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6018, "tokens_per_second_per_gpu": 10571.1, "total_tokens": 594383643 }, { "epoch": 0.37628157039259813, "grad_norm": 0.8833739757537842, "learning_rate": 2e-05, "loss": 0.6673, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6019, "tokens_per_second_per_gpu": 11197.77, "total_tokens": 594486193 }, { "epoch": 0.3763440860215054, "grad_norm": 0.9226329326629639, "learning_rate": 2e-05, "loss": 0.6776, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6020, "tokens_per_second_per_gpu": 11110.99, "total_tokens": 594587709 }, { "epoch": 0.3764066016504126, "grad_norm": 0.893883466720581, "learning_rate": 2e-05, "loss": 0.6486, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6021, "tokens_per_second_per_gpu": 10570.55, "total_tokens": 594685816 }, { "epoch": 0.3764691172793198, "grad_norm": 0.856843888759613, "learning_rate": 2e-05, "loss": 0.6617, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6022, "tokens_per_second_per_gpu": 10307.81, "total_tokens": 594784896 }, { "epoch": 0.37653163290822705, "grad_norm": 0.8901041746139526, "learning_rate": 2e-05, "loss": 0.6561, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6023, "tokens_per_second_per_gpu": 10589.14, "total_tokens": 594882855 }, { "epoch": 0.3765941485371343, "grad_norm": 0.9099535346031189, "learning_rate": 2e-05, "loss": 0.6563, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6024, "tokens_per_second_per_gpu": 10426.39, "total_tokens": 594981292 }, { "epoch": 0.3766566641660415, "grad_norm": 0.9403946399688721, "learning_rate": 2e-05, "loss": 0.6499, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6025, "tokens_per_second_per_gpu": 9732.11, "total_tokens": 595076096 }, { "epoch": 0.37671917979494873, "grad_norm": 0.9194720387458801, "learning_rate": 2e-05, "loss": 0.6721, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6026, "tokens_per_second_per_gpu": 11577.7, "total_tokens": 595181474 }, { "epoch": 0.376781695423856, "grad_norm": 0.8795245289802551, "learning_rate": 2e-05, "loss": 0.6473, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6027, "tokens_per_second_per_gpu": 10900.37, "total_tokens": 595281865 }, { "epoch": 0.37684421105276317, "grad_norm": 0.905428409576416, "learning_rate": 2e-05, "loss": 0.6548, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6028, "tokens_per_second_per_gpu": 10811.9, "total_tokens": 595377743 }, { "epoch": 0.3769067266816704, "grad_norm": 0.8726241588592529, "learning_rate": 2e-05, "loss": 0.6203, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6029, "tokens_per_second_per_gpu": 10845.34, "total_tokens": 595477049 }, { "epoch": 0.37696924231057766, "grad_norm": 0.9545915126800537, "learning_rate": 2e-05, "loss": 0.6452, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6030, "tokens_per_second_per_gpu": 10147.26, "total_tokens": 595571225 }, { "epoch": 0.37703175793948485, "grad_norm": 0.9373478293418884, "learning_rate": 2e-05, "loss": 0.6566, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6031, "tokens_per_second_per_gpu": 11495.34, "total_tokens": 595672040 }, { "epoch": 0.3770942735683921, "grad_norm": 0.8994542956352234, "learning_rate": 2e-05, "loss": 0.6475, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6032, "tokens_per_second_per_gpu": 9959.87, "total_tokens": 595767030 }, { "epoch": 0.37715678919729934, "grad_norm": 0.9046103358268738, "learning_rate": 2e-05, "loss": 0.6629, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6033, "tokens_per_second_per_gpu": 10399.55, "total_tokens": 595866069 }, { "epoch": 0.3772193048262065, "grad_norm": 0.8614180088043213, "learning_rate": 2e-05, "loss": 0.6447, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6034, "tokens_per_second_per_gpu": 11287.02, "total_tokens": 595966077 }, { "epoch": 0.37728182045511377, "grad_norm": 0.9340453147888184, "learning_rate": 2e-05, "loss": 0.6595, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6035, "tokens_per_second_per_gpu": 9778.5, "total_tokens": 596062542 }, { "epoch": 0.377344336084021, "grad_norm": 0.8392145037651062, "learning_rate": 2e-05, "loss": 0.638, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6036, "tokens_per_second_per_gpu": 10913.36, "total_tokens": 596166254 }, { "epoch": 0.37740685171292826, "grad_norm": 0.8965038061141968, "learning_rate": 2e-05, "loss": 0.6312, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6037, "tokens_per_second_per_gpu": 10699.02, "total_tokens": 596265346 }, { "epoch": 0.37746936734183545, "grad_norm": 0.8581210374832153, "learning_rate": 2e-05, "loss": 0.6376, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6038, "tokens_per_second_per_gpu": 10693.06, "total_tokens": 596365668 }, { "epoch": 0.3775318829707427, "grad_norm": 0.9101329445838928, "learning_rate": 2e-05, "loss": 0.6669, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6039, "tokens_per_second_per_gpu": 10332.1, "total_tokens": 596462485 }, { "epoch": 0.37759439859964994, "grad_norm": 0.9600066542625427, "learning_rate": 2e-05, "loss": 0.6845, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6040, "tokens_per_second_per_gpu": 10095.16, "total_tokens": 596557613 }, { "epoch": 0.37765691422855713, "grad_norm": 0.8653564453125, "learning_rate": 2e-05, "loss": 0.6687, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6041, "tokens_per_second_per_gpu": 11379.48, "total_tokens": 596662620 }, { "epoch": 0.3777194298574644, "grad_norm": 0.9224866628646851, "learning_rate": 2e-05, "loss": 0.6785, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6042, "tokens_per_second_per_gpu": 9785.69, "total_tokens": 596757124 }, { "epoch": 0.3777819454863716, "grad_norm": 0.8645384907722473, "learning_rate": 2e-05, "loss": 0.6273, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6043, "tokens_per_second_per_gpu": 10213.27, "total_tokens": 596858229 }, { "epoch": 0.3778444611152788, "grad_norm": 0.8713986873626709, "learning_rate": 2e-05, "loss": 0.6569, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6044, "tokens_per_second_per_gpu": 10850.75, "total_tokens": 596960703 }, { "epoch": 0.37790697674418605, "grad_norm": 0.8772741556167603, "learning_rate": 2e-05, "loss": 0.6504, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6045, "tokens_per_second_per_gpu": 10724.01, "total_tokens": 597060832 }, { "epoch": 0.3779694923730933, "grad_norm": 0.8532233238220215, "learning_rate": 2e-05, "loss": 0.6509, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6046, "tokens_per_second_per_gpu": 10536.87, "total_tokens": 597161338 }, { "epoch": 0.3780320080020005, "grad_norm": 0.9087211489677429, "learning_rate": 2e-05, "loss": 0.6501, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6047, "tokens_per_second_per_gpu": 10682.62, "total_tokens": 597261636 }, { "epoch": 0.37809452363090773, "grad_norm": 0.8849588632583618, "learning_rate": 2e-05, "loss": 0.7383, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6048, "tokens_per_second_per_gpu": 11208.57, "total_tokens": 597367706 }, { "epoch": 0.378157039259815, "grad_norm": 0.8598145842552185, "learning_rate": 2e-05, "loss": 0.6245, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6049, "tokens_per_second_per_gpu": 10341.19, "total_tokens": 597466214 }, { "epoch": 0.37821955488872216, "grad_norm": 0.893172562122345, "learning_rate": 2e-05, "loss": 0.6238, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6050, "tokens_per_second_per_gpu": 9957.61, "total_tokens": 597558385 }, { "epoch": 0.3782820705176294, "grad_norm": 0.9102009534835815, "learning_rate": 2e-05, "loss": 0.6522, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6051, "tokens_per_second_per_gpu": 9415.62, "total_tokens": 597652054 }, { "epoch": 0.37834458614653665, "grad_norm": 0.9195606708526611, "learning_rate": 2e-05, "loss": 0.6963, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6052, "tokens_per_second_per_gpu": 9359.53, "total_tokens": 597748625 }, { "epoch": 0.37840710177544384, "grad_norm": 0.8974853157997131, "learning_rate": 2e-05, "loss": 0.6633, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6053, "tokens_per_second_per_gpu": 10546.45, "total_tokens": 597847824 }, { "epoch": 0.3784696174043511, "grad_norm": 0.9032264947891235, "learning_rate": 2e-05, "loss": 0.6013, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6054, "tokens_per_second_per_gpu": 9548.5, "total_tokens": 597940277 }, { "epoch": 0.37853213303325833, "grad_norm": 0.9038683176040649, "learning_rate": 2e-05, "loss": 0.6588, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6055, "tokens_per_second_per_gpu": 10384.14, "total_tokens": 598035947 }, { "epoch": 0.3785946486621655, "grad_norm": 0.8854755163192749, "learning_rate": 2e-05, "loss": 0.6401, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6056, "tokens_per_second_per_gpu": 10116.54, "total_tokens": 598133509 }, { "epoch": 0.37865716429107277, "grad_norm": 0.9095764756202698, "learning_rate": 2e-05, "loss": 0.6354, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6057, "tokens_per_second_per_gpu": 9663.23, "total_tokens": 598228620 }, { "epoch": 0.37871967991998, "grad_norm": 0.8842476606369019, "learning_rate": 2e-05, "loss": 0.6398, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6058, "tokens_per_second_per_gpu": 10306.16, "total_tokens": 598327050 }, { "epoch": 0.3787821955488872, "grad_norm": 0.8955934643745422, "learning_rate": 2e-05, "loss": 0.6153, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6059, "tokens_per_second_per_gpu": 10590.42, "total_tokens": 598423424 }, { "epoch": 0.37884471117779445, "grad_norm": 0.8954613208770752, "learning_rate": 2e-05, "loss": 0.6767, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6060, "tokens_per_second_per_gpu": 11220.81, "total_tokens": 598527113 }, { "epoch": 0.3789072268067017, "grad_norm": 0.8979066610336304, "learning_rate": 2e-05, "loss": 0.6818, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6061, "tokens_per_second_per_gpu": 10794.35, "total_tokens": 598628523 }, { "epoch": 0.3789697424356089, "grad_norm": 0.8664966821670532, "learning_rate": 2e-05, "loss": 0.627, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6062, "tokens_per_second_per_gpu": 9993.38, "total_tokens": 598725358 }, { "epoch": 0.3790322580645161, "grad_norm": 1.0175490379333496, "learning_rate": 2e-05, "loss": 0.6584, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6063, "tokens_per_second_per_gpu": 10033.75, "total_tokens": 598818201 }, { "epoch": 0.37909477369342337, "grad_norm": 0.8977495431900024, "learning_rate": 2e-05, "loss": 0.6874, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6064, "tokens_per_second_per_gpu": 10295.69, "total_tokens": 598917268 }, { "epoch": 0.37915728932233056, "grad_norm": 0.9064566493034363, "learning_rate": 2e-05, "loss": 0.6661, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6065, "tokens_per_second_per_gpu": 10548.44, "total_tokens": 599012670 }, { "epoch": 0.3792198049512378, "grad_norm": 0.928997278213501, "learning_rate": 2e-05, "loss": 0.655, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6066, "tokens_per_second_per_gpu": 10396.95, "total_tokens": 599109230 }, { "epoch": 0.37928232058014505, "grad_norm": 0.8930960297584534, "learning_rate": 2e-05, "loss": 0.7254, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6067, "tokens_per_second_per_gpu": 10970.25, "total_tokens": 599214155 }, { "epoch": 0.37934483620905224, "grad_norm": 0.9365675449371338, "learning_rate": 2e-05, "loss": 0.6941, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6068, "tokens_per_second_per_gpu": 10077.94, "total_tokens": 599314084 }, { "epoch": 0.3794073518379595, "grad_norm": 0.9043465852737427, "learning_rate": 2e-05, "loss": 0.6886, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6069, "tokens_per_second_per_gpu": 11334.0, "total_tokens": 599419400 }, { "epoch": 0.37946986746686673, "grad_norm": 0.8818067312240601, "learning_rate": 2e-05, "loss": 0.6619, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6070, "tokens_per_second_per_gpu": 10902.81, "total_tokens": 599518362 }, { "epoch": 0.3795323830957739, "grad_norm": 0.8850386738777161, "learning_rate": 2e-05, "loss": 0.6707, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6071, "tokens_per_second_per_gpu": 9829.36, "total_tokens": 599615958 }, { "epoch": 0.37959489872468116, "grad_norm": 0.860115647315979, "learning_rate": 2e-05, "loss": 0.6174, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6072, "tokens_per_second_per_gpu": 10473.11, "total_tokens": 599715747 }, { "epoch": 0.3796574143535884, "grad_norm": 0.9012784957885742, "learning_rate": 2e-05, "loss": 0.6721, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6073, "tokens_per_second_per_gpu": 9401.98, "total_tokens": 599810015 }, { "epoch": 0.3797199299824956, "grad_norm": 0.8796226978302002, "learning_rate": 2e-05, "loss": 0.6599, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6074, "tokens_per_second_per_gpu": 10085.26, "total_tokens": 599910461 }, { "epoch": 0.37978244561140284, "grad_norm": 0.8931906819343567, "learning_rate": 2e-05, "loss": 0.669, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6075, "tokens_per_second_per_gpu": 11064.38, "total_tokens": 600012175 }, { "epoch": 0.3798449612403101, "grad_norm": 0.8992539644241333, "learning_rate": 2e-05, "loss": 0.6302, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6076, "tokens_per_second_per_gpu": 10256.21, "total_tokens": 600106460 }, { "epoch": 0.37990747686921733, "grad_norm": 0.9168246984481812, "learning_rate": 2e-05, "loss": 0.6862, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6077, "tokens_per_second_per_gpu": 10030.2, "total_tokens": 600206335 }, { "epoch": 0.3799699924981245, "grad_norm": 0.9263179302215576, "learning_rate": 2e-05, "loss": 0.7057, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6078, "tokens_per_second_per_gpu": 10209.54, "total_tokens": 600303765 }, { "epoch": 0.38003250812703177, "grad_norm": 0.9247167110443115, "learning_rate": 2e-05, "loss": 0.6792, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6079, "tokens_per_second_per_gpu": 10528.3, "total_tokens": 600397397 }, { "epoch": 0.380095023755939, "grad_norm": 0.8862858414649963, "learning_rate": 2e-05, "loss": 0.6599, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6080, "tokens_per_second_per_gpu": 10199.87, "total_tokens": 600497030 }, { "epoch": 0.3801575393848462, "grad_norm": 0.9039560556411743, "learning_rate": 2e-05, "loss": 0.6769, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6081, "tokens_per_second_per_gpu": 10601.8, "total_tokens": 600601251 }, { "epoch": 0.38022005501375344, "grad_norm": 0.8724515438079834, "learning_rate": 2e-05, "loss": 0.6444, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6082, "tokens_per_second_per_gpu": 10462.15, "total_tokens": 600697870 }, { "epoch": 0.3802825706426607, "grad_norm": 0.84490567445755, "learning_rate": 2e-05, "loss": 0.6329, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6083, "tokens_per_second_per_gpu": 11197.03, "total_tokens": 600799285 }, { "epoch": 0.3803450862715679, "grad_norm": 0.8944807648658752, "learning_rate": 2e-05, "loss": 0.6313, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6084, "tokens_per_second_per_gpu": 10426.17, "total_tokens": 600894945 }, { "epoch": 0.3804076019004751, "grad_norm": 0.8972856998443604, "learning_rate": 2e-05, "loss": 0.65, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6085, "tokens_per_second_per_gpu": 10242.85, "total_tokens": 600991783 }, { "epoch": 0.38047011752938237, "grad_norm": 0.9209941625595093, "learning_rate": 2e-05, "loss": 0.7358, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6086, "tokens_per_second_per_gpu": 10700.67, "total_tokens": 601093133 }, { "epoch": 0.38053263315828956, "grad_norm": 0.9032102227210999, "learning_rate": 2e-05, "loss": 0.6475, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6087, "tokens_per_second_per_gpu": 10680.34, "total_tokens": 601189131 }, { "epoch": 0.3805951487871968, "grad_norm": 0.929764986038208, "learning_rate": 2e-05, "loss": 0.6706, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6088, "tokens_per_second_per_gpu": 10357.4, "total_tokens": 601283898 }, { "epoch": 0.38065766441610405, "grad_norm": 0.8985298275947571, "learning_rate": 2e-05, "loss": 0.6381, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6089, "tokens_per_second_per_gpu": 9953.91, "total_tokens": 601379209 }, { "epoch": 0.38072018004501124, "grad_norm": 0.8710746765136719, "learning_rate": 2e-05, "loss": 0.6329, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6090, "tokens_per_second_per_gpu": 10154.86, "total_tokens": 601476009 }, { "epoch": 0.3807826956739185, "grad_norm": 0.8965381383895874, "learning_rate": 2e-05, "loss": 0.6576, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6091, "tokens_per_second_per_gpu": 10041.81, "total_tokens": 601573076 }, { "epoch": 0.3808452113028257, "grad_norm": 0.8986281752586365, "learning_rate": 2e-05, "loss": 0.6364, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6092, "tokens_per_second_per_gpu": 10283.2, "total_tokens": 601670931 }, { "epoch": 0.3809077269317329, "grad_norm": 0.9034750461578369, "learning_rate": 2e-05, "loss": 0.6335, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6093, "tokens_per_second_per_gpu": 9486.34, "total_tokens": 601765574 }, { "epoch": 0.38097024256064016, "grad_norm": 0.9012333154678345, "learning_rate": 2e-05, "loss": 0.6505, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6094, "tokens_per_second_per_gpu": 10573.66, "total_tokens": 601863230 }, { "epoch": 0.3810327581895474, "grad_norm": 0.9204564690589905, "learning_rate": 2e-05, "loss": 0.7051, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6095, "tokens_per_second_per_gpu": 10694.66, "total_tokens": 601960049 }, { "epoch": 0.3810952738184546, "grad_norm": 0.9044433832168579, "learning_rate": 2e-05, "loss": 0.6798, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6096, "tokens_per_second_per_gpu": 10964.9, "total_tokens": 602061335 }, { "epoch": 0.38115778944736184, "grad_norm": 1.0459319353103638, "learning_rate": 2e-05, "loss": 0.6717, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6097, "tokens_per_second_per_gpu": 10376.07, "total_tokens": 602157769 }, { "epoch": 0.3812203050762691, "grad_norm": 0.8815923929214478, "learning_rate": 2e-05, "loss": 0.6174, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6098, "tokens_per_second_per_gpu": 9724.92, "total_tokens": 602254899 }, { "epoch": 0.3812828207051763, "grad_norm": 0.8780354261398315, "learning_rate": 2e-05, "loss": 0.6829, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6099, "tokens_per_second_per_gpu": 10261.36, "total_tokens": 602354302 }, { "epoch": 0.3813453363340835, "grad_norm": 0.8804859519004822, "learning_rate": 2e-05, "loss": 0.5947, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6100, "tokens_per_second_per_gpu": 9518.95, "total_tokens": 602442863 }, { "epoch": 0.38140785196299076, "grad_norm": 0.8564162254333496, "learning_rate": 2e-05, "loss": 0.6422, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6101, "tokens_per_second_per_gpu": 10459.32, "total_tokens": 602541437 }, { "epoch": 0.38147036759189795, "grad_norm": 0.9087023735046387, "learning_rate": 2e-05, "loss": 0.676, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6102, "tokens_per_second_per_gpu": 9761.62, "total_tokens": 602631596 }, { "epoch": 0.3815328832208052, "grad_norm": 0.895291805267334, "learning_rate": 2e-05, "loss": 0.6635, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6103, "tokens_per_second_per_gpu": 10051.05, "total_tokens": 602728673 }, { "epoch": 0.38159539884971244, "grad_norm": 0.8788176774978638, "learning_rate": 2e-05, "loss": 0.6523, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6104, "tokens_per_second_per_gpu": 11192.29, "total_tokens": 602828715 }, { "epoch": 0.38165791447861963, "grad_norm": 0.8969796895980835, "learning_rate": 2e-05, "loss": 0.6818, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6105, "tokens_per_second_per_gpu": 9917.91, "total_tokens": 602923647 }, { "epoch": 0.3817204301075269, "grad_norm": 0.857045590877533, "learning_rate": 2e-05, "loss": 0.6698, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6106, "tokens_per_second_per_gpu": 10550.17, "total_tokens": 603024009 }, { "epoch": 0.3817829457364341, "grad_norm": 0.8625147938728333, "learning_rate": 2e-05, "loss": 0.6573, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6107, "tokens_per_second_per_gpu": 10827.83, "total_tokens": 603125322 }, { "epoch": 0.3818454613653413, "grad_norm": 0.9320398569107056, "learning_rate": 2e-05, "loss": 0.7703, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6108, "tokens_per_second_per_gpu": 10276.0, "total_tokens": 603224840 }, { "epoch": 0.38190797699424855, "grad_norm": 0.8956214189529419, "learning_rate": 2e-05, "loss": 0.6828, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6109, "tokens_per_second_per_gpu": 10012.59, "total_tokens": 603320021 }, { "epoch": 0.3819704926231558, "grad_norm": 0.9106471538543701, "learning_rate": 2e-05, "loss": 0.6867, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6110, "tokens_per_second_per_gpu": 10842.84, "total_tokens": 603419006 }, { "epoch": 0.382033008252063, "grad_norm": 0.9048755168914795, "learning_rate": 2e-05, "loss": 0.6481, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6111, "tokens_per_second_per_gpu": 10145.12, "total_tokens": 603514011 }, { "epoch": 0.38209552388097023, "grad_norm": 0.8703773021697998, "learning_rate": 2e-05, "loss": 0.646, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6112, "tokens_per_second_per_gpu": 10971.64, "total_tokens": 603614407 }, { "epoch": 0.3821580395098775, "grad_norm": 0.8601861596107483, "learning_rate": 2e-05, "loss": 0.6629, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6113, "tokens_per_second_per_gpu": 10560.4, "total_tokens": 603714553 }, { "epoch": 0.3822205551387847, "grad_norm": 0.9144816994667053, "learning_rate": 2e-05, "loss": 0.6505, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6114, "tokens_per_second_per_gpu": 10127.71, "total_tokens": 603809210 }, { "epoch": 0.3822830707676919, "grad_norm": 0.9510620832443237, "learning_rate": 2e-05, "loss": 0.6881, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6115, "tokens_per_second_per_gpu": 10098.5, "total_tokens": 603906105 }, { "epoch": 0.38234558639659916, "grad_norm": 0.892260730266571, "learning_rate": 2e-05, "loss": 0.6645, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6116, "tokens_per_second_per_gpu": 9761.57, "total_tokens": 604001644 }, { "epoch": 0.3824081020255064, "grad_norm": 0.998849630355835, "learning_rate": 2e-05, "loss": 0.6863, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6117, "tokens_per_second_per_gpu": 10254.93, "total_tokens": 604098950 }, { "epoch": 0.3824706176544136, "grad_norm": 0.9160689115524292, "learning_rate": 2e-05, "loss": 0.6986, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6118, "tokens_per_second_per_gpu": 9928.92, "total_tokens": 604197834 }, { "epoch": 0.38253313328332084, "grad_norm": 0.9498291611671448, "learning_rate": 2e-05, "loss": 0.6331, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6119, "tokens_per_second_per_gpu": 9324.67, "total_tokens": 604285248 }, { "epoch": 0.3825956489122281, "grad_norm": 0.8946958780288696, "learning_rate": 2e-05, "loss": 0.6685, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6120, "tokens_per_second_per_gpu": 10318.17, "total_tokens": 604381767 }, { "epoch": 0.38265816454113527, "grad_norm": 0.8756511211395264, "learning_rate": 2e-05, "loss": 0.6326, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6121, "tokens_per_second_per_gpu": 10395.22, "total_tokens": 604481387 }, { "epoch": 0.3827206801700425, "grad_norm": 0.8736767172813416, "learning_rate": 2e-05, "loss": 0.5959, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6122, "tokens_per_second_per_gpu": 9613.36, "total_tokens": 604575025 }, { "epoch": 0.38278319579894976, "grad_norm": 0.9284313321113586, "learning_rate": 2e-05, "loss": 0.6913, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6123, "tokens_per_second_per_gpu": 9976.23, "total_tokens": 604670836 }, { "epoch": 0.38284571142785695, "grad_norm": 0.9188103675842285, "learning_rate": 2e-05, "loss": 0.651, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6124, "tokens_per_second_per_gpu": 10245.71, "total_tokens": 604765318 }, { "epoch": 0.3829082270567642, "grad_norm": 0.889227032661438, "learning_rate": 2e-05, "loss": 0.6603, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6125, "tokens_per_second_per_gpu": 11268.36, "total_tokens": 604868013 }, { "epoch": 0.38297074268567144, "grad_norm": 0.9221677184104919, "learning_rate": 2e-05, "loss": 0.6425, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6126, "tokens_per_second_per_gpu": 10171.27, "total_tokens": 604960272 }, { "epoch": 0.38303325831457863, "grad_norm": 0.9191535711288452, "learning_rate": 2e-05, "loss": 0.6769, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6127, "tokens_per_second_per_gpu": 10044.42, "total_tokens": 605057282 }, { "epoch": 0.3830957739434859, "grad_norm": 0.9147822856903076, "learning_rate": 2e-05, "loss": 0.6558, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6128, "tokens_per_second_per_gpu": 10423.86, "total_tokens": 605151541 }, { "epoch": 0.3831582895723931, "grad_norm": 0.9300971031188965, "learning_rate": 2e-05, "loss": 0.6149, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6129, "tokens_per_second_per_gpu": 10222.21, "total_tokens": 605244237 }, { "epoch": 0.3832208052013003, "grad_norm": 0.9599201679229736, "learning_rate": 2e-05, "loss": 0.6518, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6130, "tokens_per_second_per_gpu": 10557.99, "total_tokens": 605341874 }, { "epoch": 0.38328332083020755, "grad_norm": 0.9321171641349792, "learning_rate": 2e-05, "loss": 0.6509, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6131, "tokens_per_second_per_gpu": 10394.69, "total_tokens": 605443148 }, { "epoch": 0.3833458364591148, "grad_norm": 0.9216844439506531, "learning_rate": 2e-05, "loss": 0.6605, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6132, "tokens_per_second_per_gpu": 10174.84, "total_tokens": 605533879 }, { "epoch": 0.383408352088022, "grad_norm": 0.9058029055595398, "learning_rate": 2e-05, "loss": 0.6621, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6133, "tokens_per_second_per_gpu": 10272.91, "total_tokens": 605634701 }, { "epoch": 0.38347086771692923, "grad_norm": 0.9364041686058044, "learning_rate": 2e-05, "loss": 0.6681, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6134, "tokens_per_second_per_gpu": 9842.32, "total_tokens": 605730661 }, { "epoch": 0.3835333833458365, "grad_norm": 0.9479724168777466, "learning_rate": 2e-05, "loss": 0.6531, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6135, "tokens_per_second_per_gpu": 10042.85, "total_tokens": 605823667 }, { "epoch": 0.38359589897474367, "grad_norm": 0.8993833065032959, "learning_rate": 2e-05, "loss": 0.6374, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6136, "tokens_per_second_per_gpu": 10334.61, "total_tokens": 605924441 }, { "epoch": 0.3836584146036509, "grad_norm": 0.9536347985267639, "learning_rate": 2e-05, "loss": 0.6771, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6137, "tokens_per_second_per_gpu": 10895.97, "total_tokens": 606024761 }, { "epoch": 0.38372093023255816, "grad_norm": 1.0160223245620728, "learning_rate": 2e-05, "loss": 0.6325, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6138, "tokens_per_second_per_gpu": 11395.69, "total_tokens": 606121954 }, { "epoch": 0.38378344586146534, "grad_norm": 0.8972322344779968, "learning_rate": 2e-05, "loss": 0.6337, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6139, "tokens_per_second_per_gpu": 10126.5, "total_tokens": 606216218 }, { "epoch": 0.3838459614903726, "grad_norm": 0.9950000047683716, "learning_rate": 2e-05, "loss": 0.6532, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6140, "tokens_per_second_per_gpu": 10099.07, "total_tokens": 606312526 }, { "epoch": 0.38390847711927983, "grad_norm": 0.929010808467865, "learning_rate": 2e-05, "loss": 0.6508, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6141, "tokens_per_second_per_gpu": 9742.75, "total_tokens": 606405911 }, { "epoch": 0.383970992748187, "grad_norm": 0.9367231726646423, "learning_rate": 2e-05, "loss": 0.6293, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6142, "tokens_per_second_per_gpu": 9470.85, "total_tokens": 606500690 }, { "epoch": 0.38403350837709427, "grad_norm": 0.9068073630332947, "learning_rate": 2e-05, "loss": 0.6872, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6143, "tokens_per_second_per_gpu": 10875.16, "total_tokens": 606600227 }, { "epoch": 0.3840960240060015, "grad_norm": 0.9895057678222656, "learning_rate": 2e-05, "loss": 0.6899, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6144, "tokens_per_second_per_gpu": 10909.39, "total_tokens": 606699987 }, { "epoch": 0.3841585396349087, "grad_norm": 0.8806506395339966, "learning_rate": 2e-05, "loss": 0.6206, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6145, "tokens_per_second_per_gpu": 10587.34, "total_tokens": 606799972 }, { "epoch": 0.38422105526381595, "grad_norm": 0.9220358729362488, "learning_rate": 2e-05, "loss": 0.6888, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6146, "tokens_per_second_per_gpu": 10567.9, "total_tokens": 606899980 }, { "epoch": 0.3842835708927232, "grad_norm": 0.9168636202812195, "learning_rate": 2e-05, "loss": 0.6149, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6147, "tokens_per_second_per_gpu": 10011.11, "total_tokens": 606994427 }, { "epoch": 0.3843460865216304, "grad_norm": 0.8869627714157104, "learning_rate": 2e-05, "loss": 0.6548, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6148, "tokens_per_second_per_gpu": 10177.32, "total_tokens": 607092238 }, { "epoch": 0.3844086021505376, "grad_norm": 0.8953045606613159, "learning_rate": 2e-05, "loss": 0.6367, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6149, "tokens_per_second_per_gpu": 10629.45, "total_tokens": 607192324 }, { "epoch": 0.38447111777944487, "grad_norm": 0.8489092588424683, "learning_rate": 2e-05, "loss": 0.6262, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6150, "tokens_per_second_per_gpu": 10572.12, "total_tokens": 607291278 }, { "epoch": 0.3845336334083521, "grad_norm": 0.8716083765029907, "learning_rate": 2e-05, "loss": 0.6642, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6151, "tokens_per_second_per_gpu": 10827.95, "total_tokens": 607391399 }, { "epoch": 0.3845961490372593, "grad_norm": 0.8909577131271362, "learning_rate": 2e-05, "loss": 0.6738, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6152, "tokens_per_second_per_gpu": 10677.99, "total_tokens": 607493539 }, { "epoch": 0.38465866466616655, "grad_norm": 0.9382730722427368, "learning_rate": 2e-05, "loss": 0.6662, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6153, "tokens_per_second_per_gpu": 10108.1, "total_tokens": 607589655 }, { "epoch": 0.3847211802950738, "grad_norm": 0.9101101160049438, "learning_rate": 2e-05, "loss": 0.6622, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6154, "tokens_per_second_per_gpu": 10550.49, "total_tokens": 607688035 }, { "epoch": 0.384783695923981, "grad_norm": 0.9018828868865967, "learning_rate": 2e-05, "loss": 0.6332, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6155, "tokens_per_second_per_gpu": 10050.95, "total_tokens": 607781401 }, { "epoch": 0.38484621155288823, "grad_norm": 0.9302418231964111, "learning_rate": 2e-05, "loss": 0.6989, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6156, "tokens_per_second_per_gpu": 10762.66, "total_tokens": 607880546 }, { "epoch": 0.3849087271817955, "grad_norm": 0.8883625864982605, "learning_rate": 2e-05, "loss": 0.6508, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6157, "tokens_per_second_per_gpu": 10736.77, "total_tokens": 607978157 }, { "epoch": 0.38497124281070266, "grad_norm": 0.9133388996124268, "learning_rate": 2e-05, "loss": 0.6877, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6158, "tokens_per_second_per_gpu": 9318.23, "total_tokens": 608071396 }, { "epoch": 0.3850337584396099, "grad_norm": 0.8665643930435181, "learning_rate": 2e-05, "loss": 0.6236, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6159, "tokens_per_second_per_gpu": 11101.97, "total_tokens": 608169713 }, { "epoch": 0.38509627406851715, "grad_norm": 0.937944769859314, "learning_rate": 2e-05, "loss": 0.6464, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6160, "tokens_per_second_per_gpu": 10043.35, "total_tokens": 608264552 }, { "epoch": 0.38515878969742434, "grad_norm": 0.8842713832855225, "learning_rate": 2e-05, "loss": 0.633, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6161, "tokens_per_second_per_gpu": 9716.48, "total_tokens": 608360404 }, { "epoch": 0.3852213053263316, "grad_norm": 0.8878777027130127, "learning_rate": 2e-05, "loss": 0.7092, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6162, "tokens_per_second_per_gpu": 10677.78, "total_tokens": 608460758 }, { "epoch": 0.38528382095523883, "grad_norm": 0.9029558897018433, "learning_rate": 2e-05, "loss": 0.6501, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6163, "tokens_per_second_per_gpu": 10251.35, "total_tokens": 608556502 }, { "epoch": 0.385346336584146, "grad_norm": 0.8849307894706726, "learning_rate": 2e-05, "loss": 0.6621, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6164, "tokens_per_second_per_gpu": 9846.39, "total_tokens": 608649799 }, { "epoch": 0.38540885221305327, "grad_norm": 0.9112111926078796, "learning_rate": 2e-05, "loss": 0.6981, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6165, "tokens_per_second_per_gpu": 9991.05, "total_tokens": 608747901 }, { "epoch": 0.3854713678419605, "grad_norm": 0.9143921136856079, "learning_rate": 2e-05, "loss": 0.6664, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6166, "tokens_per_second_per_gpu": 10632.22, "total_tokens": 608845668 }, { "epoch": 0.3855338834708677, "grad_norm": 0.8817989826202393, "learning_rate": 2e-05, "loss": 0.6612, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6167, "tokens_per_second_per_gpu": 10975.5, "total_tokens": 608946526 }, { "epoch": 0.38559639909977494, "grad_norm": 0.9156681895256042, "learning_rate": 2e-05, "loss": 0.6497, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6168, "tokens_per_second_per_gpu": 10020.67, "total_tokens": 609041109 }, { "epoch": 0.3856589147286822, "grad_norm": 0.8801187872886658, "learning_rate": 2e-05, "loss": 0.6872, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6169, "tokens_per_second_per_gpu": 10648.62, "total_tokens": 609141460 }, { "epoch": 0.3857214303575894, "grad_norm": 0.9035972356796265, "learning_rate": 2e-05, "loss": 0.6344, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6170, "tokens_per_second_per_gpu": 10027.04, "total_tokens": 609233268 }, { "epoch": 0.3857839459864966, "grad_norm": 0.8881092071533203, "learning_rate": 2e-05, "loss": 0.6751, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6171, "tokens_per_second_per_gpu": 10588.67, "total_tokens": 609332723 }, { "epoch": 0.38584646161540387, "grad_norm": 0.9097373485565186, "learning_rate": 2e-05, "loss": 0.6615, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6172, "tokens_per_second_per_gpu": 10373.8, "total_tokens": 609429415 }, { "epoch": 0.38590897724431106, "grad_norm": 0.8949248790740967, "learning_rate": 2e-05, "loss": 0.6752, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6173, "tokens_per_second_per_gpu": 9926.46, "total_tokens": 609524450 }, { "epoch": 0.3859714928732183, "grad_norm": 0.9147812128067017, "learning_rate": 2e-05, "loss": 0.6303, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6174, "tokens_per_second_per_gpu": 10162.15, "total_tokens": 609617923 }, { "epoch": 0.38603400850212555, "grad_norm": 0.8670189380645752, "learning_rate": 2e-05, "loss": 0.6587, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6175, "tokens_per_second_per_gpu": 10677.3, "total_tokens": 609717066 }, { "epoch": 0.38609652413103274, "grad_norm": 0.8903567790985107, "learning_rate": 2e-05, "loss": 0.643, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6176, "tokens_per_second_per_gpu": 10942.45, "total_tokens": 609817594 }, { "epoch": 0.38615903975994, "grad_norm": 0.8993700742721558, "learning_rate": 2e-05, "loss": 0.6708, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6177, "tokens_per_second_per_gpu": 10898.05, "total_tokens": 609916547 }, { "epoch": 0.3862215553888472, "grad_norm": 0.9172121286392212, "learning_rate": 2e-05, "loss": 0.665, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6178, "tokens_per_second_per_gpu": 10006.17, "total_tokens": 610013235 }, { "epoch": 0.3862840710177544, "grad_norm": 0.8977596163749695, "learning_rate": 2e-05, "loss": 0.6542, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6179, "tokens_per_second_per_gpu": 11508.22, "total_tokens": 610110699 }, { "epoch": 0.38634658664666166, "grad_norm": 0.9455530643463135, "learning_rate": 2e-05, "loss": 0.6549, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6180, "tokens_per_second_per_gpu": 12618.66, "total_tokens": 610202588 }, { "epoch": 0.3864091022755689, "grad_norm": 0.9158532619476318, "learning_rate": 2e-05, "loss": 0.6444, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6181, "tokens_per_second_per_gpu": 10833.72, "total_tokens": 610294805 }, { "epoch": 0.3864716179044761, "grad_norm": 0.939917266368866, "learning_rate": 2e-05, "loss": 0.6641, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6182, "tokens_per_second_per_gpu": 10425.29, "total_tokens": 610388326 }, { "epoch": 0.38653413353338334, "grad_norm": 0.8709375262260437, "learning_rate": 2e-05, "loss": 0.666, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6183, "tokens_per_second_per_gpu": 10711.93, "total_tokens": 610487440 }, { "epoch": 0.3865966491622906, "grad_norm": 0.9257996678352356, "learning_rate": 2e-05, "loss": 0.6856, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6184, "tokens_per_second_per_gpu": 10190.2, "total_tokens": 610580391 }, { "epoch": 0.3866591647911978, "grad_norm": 0.972592830657959, "learning_rate": 2e-05, "loss": 0.6731, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6185, "tokens_per_second_per_gpu": 11116.46, "total_tokens": 610674160 }, { "epoch": 0.386721680420105, "grad_norm": 0.8907822966575623, "learning_rate": 2e-05, "loss": 0.6467, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6186, "tokens_per_second_per_gpu": 10829.73, "total_tokens": 610772408 }, { "epoch": 0.38678419604901226, "grad_norm": 0.9654648900032043, "learning_rate": 2e-05, "loss": 0.6981, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6187, "tokens_per_second_per_gpu": 10140.2, "total_tokens": 610868236 }, { "epoch": 0.38684671167791945, "grad_norm": 0.8678687810897827, "learning_rate": 2e-05, "loss": 0.6275, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6188, "tokens_per_second_per_gpu": 10218.65, "total_tokens": 610964633 }, { "epoch": 0.3869092273068267, "grad_norm": 0.9149739742279053, "learning_rate": 2e-05, "loss": 0.6587, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6189, "tokens_per_second_per_gpu": 11341.16, "total_tokens": 611065530 }, { "epoch": 0.38697174293573394, "grad_norm": 0.8645495176315308, "learning_rate": 2e-05, "loss": 0.6366, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6190, "tokens_per_second_per_gpu": 10445.45, "total_tokens": 611163428 }, { "epoch": 0.3870342585646412, "grad_norm": 0.9060890078544617, "learning_rate": 2e-05, "loss": 0.6764, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6191, "tokens_per_second_per_gpu": 10932.87, "total_tokens": 611260222 }, { "epoch": 0.3870967741935484, "grad_norm": 0.8877385258674622, "learning_rate": 2e-05, "loss": 0.675, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6192, "tokens_per_second_per_gpu": 10934.33, "total_tokens": 611358768 }, { "epoch": 0.3871592898224556, "grad_norm": 0.8838620185852051, "learning_rate": 2e-05, "loss": 0.6399, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6193, "tokens_per_second_per_gpu": 9719.21, "total_tokens": 611455111 }, { "epoch": 0.38722180545136287, "grad_norm": 0.9045339822769165, "learning_rate": 2e-05, "loss": 0.6272, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6194, "tokens_per_second_per_gpu": 10217.78, "total_tokens": 611549789 }, { "epoch": 0.38728432108027006, "grad_norm": 0.945842444896698, "learning_rate": 2e-05, "loss": 0.6619, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6195, "tokens_per_second_per_gpu": 10080.38, "total_tokens": 611644474 }, { "epoch": 0.3873468367091773, "grad_norm": 0.9234296679496765, "learning_rate": 2e-05, "loss": 0.6808, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6196, "tokens_per_second_per_gpu": 10782.02, "total_tokens": 611743129 }, { "epoch": 0.38740935233808454, "grad_norm": 0.913854718208313, "learning_rate": 2e-05, "loss": 0.6642, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6197, "tokens_per_second_per_gpu": 9347.03, "total_tokens": 611837047 }, { "epoch": 0.38747186796699173, "grad_norm": 0.8856122493743896, "learning_rate": 2e-05, "loss": 0.6547, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6198, "tokens_per_second_per_gpu": 10651.26, "total_tokens": 611937175 }, { "epoch": 0.387534383595899, "grad_norm": 0.9085307717323303, "learning_rate": 2e-05, "loss": 0.6333, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6199, "tokens_per_second_per_gpu": 9349.48, "total_tokens": 612031245 }, { "epoch": 0.3875968992248062, "grad_norm": 0.9197708964347839, "learning_rate": 2e-05, "loss": 0.6695, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6200, "tokens_per_second_per_gpu": 9933.35, "total_tokens": 612127387 }, { "epoch": 0.3876594148537134, "grad_norm": 0.9904439449310303, "learning_rate": 2e-05, "loss": 0.6782, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6201, "tokens_per_second_per_gpu": 9756.68, "total_tokens": 612222701 }, { "epoch": 0.38772193048262066, "grad_norm": 0.8849842548370361, "learning_rate": 2e-05, "loss": 0.6751, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6202, "tokens_per_second_per_gpu": 9647.77, "total_tokens": 612318454 }, { "epoch": 0.3877844461115279, "grad_norm": 0.9368736743927002, "learning_rate": 2e-05, "loss": 0.6946, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6203, "tokens_per_second_per_gpu": 11297.59, "total_tokens": 612419196 }, { "epoch": 0.3878469617404351, "grad_norm": 0.9158841967582703, "learning_rate": 2e-05, "loss": 0.6791, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6204, "tokens_per_second_per_gpu": 9727.83, "total_tokens": 612520614 }, { "epoch": 0.38790947736934234, "grad_norm": 0.9311093688011169, "learning_rate": 2e-05, "loss": 0.6764, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6205, "tokens_per_second_per_gpu": 9969.38, "total_tokens": 612614280 }, { "epoch": 0.3879719929982496, "grad_norm": 0.9403015971183777, "learning_rate": 2e-05, "loss": 0.651, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6206, "tokens_per_second_per_gpu": 10309.48, "total_tokens": 612711188 }, { "epoch": 0.38803450862715677, "grad_norm": 0.864399790763855, "learning_rate": 2e-05, "loss": 0.6359, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6207, "tokens_per_second_per_gpu": 10211.23, "total_tokens": 612809956 }, { "epoch": 0.388097024256064, "grad_norm": 0.9307365417480469, "learning_rate": 2e-05, "loss": 0.6793, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6208, "tokens_per_second_per_gpu": 9444.66, "total_tokens": 612907511 }, { "epoch": 0.38815953988497126, "grad_norm": 0.9664154052734375, "learning_rate": 2e-05, "loss": 0.631, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6209, "tokens_per_second_per_gpu": 9660.93, "total_tokens": 613001711 }, { "epoch": 0.38822205551387845, "grad_norm": 0.9482241272926331, "learning_rate": 2e-05, "loss": 0.6835, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6210, "tokens_per_second_per_gpu": 10707.83, "total_tokens": 613096600 }, { "epoch": 0.3882845711427857, "grad_norm": 0.8710803985595703, "learning_rate": 2e-05, "loss": 0.6527, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6211, "tokens_per_second_per_gpu": 10694.6, "total_tokens": 613196246 }, { "epoch": 0.38834708677169294, "grad_norm": 0.89296555519104, "learning_rate": 2e-05, "loss": 0.643, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6212, "tokens_per_second_per_gpu": 11304.28, "total_tokens": 613296041 }, { "epoch": 0.38840960240060013, "grad_norm": 0.961418867111206, "learning_rate": 2e-05, "loss": 0.6952, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6213, "tokens_per_second_per_gpu": 10971.46, "total_tokens": 613394636 }, { "epoch": 0.3884721180295074, "grad_norm": 0.9341582655906677, "learning_rate": 2e-05, "loss": 0.6709, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6214, "tokens_per_second_per_gpu": 10240.05, "total_tokens": 613492032 }, { "epoch": 0.3885346336584146, "grad_norm": 0.8890624642372131, "learning_rate": 2e-05, "loss": 0.6152, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6215, "tokens_per_second_per_gpu": 9225.08, "total_tokens": 613580428 }, { "epoch": 0.3885971492873218, "grad_norm": 0.9082399606704712, "learning_rate": 2e-05, "loss": 0.6622, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6216, "tokens_per_second_per_gpu": 10545.7, "total_tokens": 613679756 }, { "epoch": 0.38865966491622905, "grad_norm": 0.901868462562561, "learning_rate": 2e-05, "loss": 0.6454, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6217, "tokens_per_second_per_gpu": 10710.16, "total_tokens": 613777948 }, { "epoch": 0.3887221805451363, "grad_norm": 0.9424877762794495, "learning_rate": 2e-05, "loss": 0.6516, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6218, "tokens_per_second_per_gpu": 10281.26, "total_tokens": 613874963 }, { "epoch": 0.3887846961740435, "grad_norm": 0.9024754762649536, "learning_rate": 2e-05, "loss": 0.6951, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6219, "tokens_per_second_per_gpu": 10593.08, "total_tokens": 613972108 }, { "epoch": 0.38884721180295073, "grad_norm": 0.8892279863357544, "learning_rate": 2e-05, "loss": 0.6487, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6220, "tokens_per_second_per_gpu": 10952.06, "total_tokens": 614071836 }, { "epoch": 0.388909727431858, "grad_norm": 0.9668615460395813, "learning_rate": 2e-05, "loss": 0.6452, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6221, "tokens_per_second_per_gpu": 9959.78, "total_tokens": 614168285 }, { "epoch": 0.38897224306076517, "grad_norm": 0.9193543195724487, "learning_rate": 2e-05, "loss": 0.644, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6222, "tokens_per_second_per_gpu": 10561.82, "total_tokens": 614264817 }, { "epoch": 0.3890347586896724, "grad_norm": 0.8902843594551086, "learning_rate": 2e-05, "loss": 0.6868, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6223, "tokens_per_second_per_gpu": 10830.54, "total_tokens": 614363425 }, { "epoch": 0.38909727431857966, "grad_norm": 0.8901088833808899, "learning_rate": 2e-05, "loss": 0.6498, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6224, "tokens_per_second_per_gpu": 10632.51, "total_tokens": 614459235 }, { "epoch": 0.38915978994748684, "grad_norm": 0.9054637551307678, "learning_rate": 2e-05, "loss": 0.6323, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6225, "tokens_per_second_per_gpu": 10302.67, "total_tokens": 614557836 }, { "epoch": 0.3892223055763941, "grad_norm": 0.8862048387527466, "learning_rate": 2e-05, "loss": 0.6453, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6226, "tokens_per_second_per_gpu": 10449.53, "total_tokens": 614655850 }, { "epoch": 0.38928482120530133, "grad_norm": 0.9294852614402771, "learning_rate": 2e-05, "loss": 0.666, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6227, "tokens_per_second_per_gpu": 11240.58, "total_tokens": 614752580 }, { "epoch": 0.3893473368342086, "grad_norm": 0.8975330591201782, "learning_rate": 2e-05, "loss": 0.6428, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6228, "tokens_per_second_per_gpu": 10748.5, "total_tokens": 614844854 }, { "epoch": 0.38940985246311577, "grad_norm": 0.8885515928268433, "learning_rate": 2e-05, "loss": 0.6977, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6229, "tokens_per_second_per_gpu": 10593.14, "total_tokens": 614944279 }, { "epoch": 0.389472368092023, "grad_norm": 0.9090505242347717, "learning_rate": 2e-05, "loss": 0.6542, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6230, "tokens_per_second_per_gpu": 10898.32, "total_tokens": 615045823 }, { "epoch": 0.38953488372093026, "grad_norm": 0.930353045463562, "learning_rate": 2e-05, "loss": 0.6596, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6231, "tokens_per_second_per_gpu": 10799.86, "total_tokens": 615144022 }, { "epoch": 0.38959739934983745, "grad_norm": 0.9466739296913147, "learning_rate": 2e-05, "loss": 0.6392, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6232, "tokens_per_second_per_gpu": 10087.71, "total_tokens": 615237360 }, { "epoch": 0.3896599149787447, "grad_norm": 0.9793422818183899, "learning_rate": 2e-05, "loss": 0.6906, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6233, "tokens_per_second_per_gpu": 11176.76, "total_tokens": 615332706 }, { "epoch": 0.38972243060765194, "grad_norm": 0.8952852487564087, "learning_rate": 2e-05, "loss": 0.7086, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6234, "tokens_per_second_per_gpu": 10469.04, "total_tokens": 615433366 }, { "epoch": 0.3897849462365591, "grad_norm": 0.8982407450675964, "learning_rate": 2e-05, "loss": 0.651, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6235, "tokens_per_second_per_gpu": 10184.44, "total_tokens": 615531858 }, { "epoch": 0.38984746186546637, "grad_norm": 0.9370831251144409, "learning_rate": 2e-05, "loss": 0.6681, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6236, "tokens_per_second_per_gpu": 10162.45, "total_tokens": 615630654 }, { "epoch": 0.3899099774943736, "grad_norm": 0.9005305171012878, "learning_rate": 2e-05, "loss": 0.6653, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6237, "tokens_per_second_per_gpu": 10943.34, "total_tokens": 615729589 }, { "epoch": 0.3899724931232808, "grad_norm": 0.9147433042526245, "learning_rate": 2e-05, "loss": 0.6824, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6238, "tokens_per_second_per_gpu": 10387.07, "total_tokens": 615825299 }, { "epoch": 0.39003500875218805, "grad_norm": 0.9135093092918396, "learning_rate": 2e-05, "loss": 0.6378, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6239, "tokens_per_second_per_gpu": 10975.04, "total_tokens": 615924856 }, { "epoch": 0.3900975243810953, "grad_norm": 0.9127154350280762, "learning_rate": 2e-05, "loss": 0.691, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6240, "tokens_per_second_per_gpu": 11089.83, "total_tokens": 616025232 }, { "epoch": 0.3901600400100025, "grad_norm": 0.866161048412323, "learning_rate": 2e-05, "loss": 0.6338, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6241, "tokens_per_second_per_gpu": 10423.94, "total_tokens": 616120462 }, { "epoch": 0.39022255563890973, "grad_norm": 0.9082821607589722, "learning_rate": 2e-05, "loss": 0.6764, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6242, "tokens_per_second_per_gpu": 9597.7, "total_tokens": 616219044 }, { "epoch": 0.390285071267817, "grad_norm": 0.8926596641540527, "learning_rate": 2e-05, "loss": 0.6464, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6243, "tokens_per_second_per_gpu": 9770.51, "total_tokens": 616314292 }, { "epoch": 0.39034758689672416, "grad_norm": 0.8548202514648438, "learning_rate": 2e-05, "loss": 0.6589, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6244, "tokens_per_second_per_gpu": 11057.04, "total_tokens": 616416571 }, { "epoch": 0.3904101025256314, "grad_norm": 0.8895544409751892, "learning_rate": 2e-05, "loss": 0.6533, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6245, "tokens_per_second_per_gpu": 10590.94, "total_tokens": 616519976 }, { "epoch": 0.39047261815453865, "grad_norm": 0.903544545173645, "learning_rate": 2e-05, "loss": 0.6787, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6246, "tokens_per_second_per_gpu": 10417.85, "total_tokens": 616618033 }, { "epoch": 0.39053513378344584, "grad_norm": 0.9005745053291321, "learning_rate": 2e-05, "loss": 0.6565, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6247, "tokens_per_second_per_gpu": 9904.03, "total_tokens": 616716648 }, { "epoch": 0.3905976494123531, "grad_norm": 0.8743372559547424, "learning_rate": 2e-05, "loss": 0.6526, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6248, "tokens_per_second_per_gpu": 10915.33, "total_tokens": 616816270 }, { "epoch": 0.39066016504126033, "grad_norm": 0.8786919116973877, "learning_rate": 2e-05, "loss": 0.6299, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6249, "tokens_per_second_per_gpu": 10025.91, "total_tokens": 616912760 }, { "epoch": 0.3907226806701675, "grad_norm": 0.9112949967384338, "learning_rate": 2e-05, "loss": 0.6451, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6250, "tokens_per_second_per_gpu": 10227.92, "total_tokens": 617010458 }, { "epoch": 0.39078519629907477, "grad_norm": 0.8665185570716858, "learning_rate": 2e-05, "loss": 0.6342, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6251, "tokens_per_second_per_gpu": 10869.28, "total_tokens": 617108972 }, { "epoch": 0.390847711927982, "grad_norm": 0.8757531046867371, "learning_rate": 2e-05, "loss": 0.656, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6252, "tokens_per_second_per_gpu": 11162.7, "total_tokens": 617210094 }, { "epoch": 0.3909102275568892, "grad_norm": 0.8524587750434875, "learning_rate": 2e-05, "loss": 0.654, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6253, "tokens_per_second_per_gpu": 10609.08, "total_tokens": 617312764 }, { "epoch": 0.39097274318579645, "grad_norm": 0.8513942360877991, "learning_rate": 2e-05, "loss": 0.6378, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6254, "tokens_per_second_per_gpu": 11022.95, "total_tokens": 617414958 }, { "epoch": 0.3910352588147037, "grad_norm": 0.9060888290405273, "learning_rate": 2e-05, "loss": 0.6758, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6255, "tokens_per_second_per_gpu": 10961.13, "total_tokens": 617513952 }, { "epoch": 0.3910977744436109, "grad_norm": 0.8780406713485718, "learning_rate": 2e-05, "loss": 0.6954, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6256, "tokens_per_second_per_gpu": 10588.2, "total_tokens": 617615943 }, { "epoch": 0.3911602900725181, "grad_norm": 0.893024742603302, "learning_rate": 2e-05, "loss": 0.6891, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6257, "tokens_per_second_per_gpu": 10487.08, "total_tokens": 617713713 }, { "epoch": 0.39122280570142537, "grad_norm": 0.8663656711578369, "learning_rate": 2e-05, "loss": 0.6455, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6258, "tokens_per_second_per_gpu": 10373.95, "total_tokens": 617813221 }, { "epoch": 0.39128532133033256, "grad_norm": 0.930950939655304, "learning_rate": 2e-05, "loss": 0.6906, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6259, "tokens_per_second_per_gpu": 10633.72, "total_tokens": 617911570 }, { "epoch": 0.3913478369592398, "grad_norm": 0.8628442883491516, "learning_rate": 2e-05, "loss": 0.6668, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6260, "tokens_per_second_per_gpu": 11294.27, "total_tokens": 618017231 }, { "epoch": 0.39141035258814705, "grad_norm": 0.8985891342163086, "learning_rate": 2e-05, "loss": 0.6601, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6261, "tokens_per_second_per_gpu": 11152.25, "total_tokens": 618118773 }, { "epoch": 0.39147286821705424, "grad_norm": 0.900892972946167, "learning_rate": 2e-05, "loss": 0.6849, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6262, "tokens_per_second_per_gpu": 10879.3, "total_tokens": 618218973 }, { "epoch": 0.3915353838459615, "grad_norm": 0.897515058517456, "learning_rate": 2e-05, "loss": 0.636, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6263, "tokens_per_second_per_gpu": 9997.0, "total_tokens": 618314821 }, { "epoch": 0.3915978994748687, "grad_norm": 0.8853706121444702, "learning_rate": 2e-05, "loss": 0.6793, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6264, "tokens_per_second_per_gpu": 10802.46, "total_tokens": 618417270 }, { "epoch": 0.3916604151037759, "grad_norm": 0.9124217629432678, "learning_rate": 2e-05, "loss": 0.629, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6265, "tokens_per_second_per_gpu": 10977.74, "total_tokens": 618516082 }, { "epoch": 0.39172293073268316, "grad_norm": 0.8736255764961243, "learning_rate": 2e-05, "loss": 0.671, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6266, "tokens_per_second_per_gpu": 10898.45, "total_tokens": 618617027 }, { "epoch": 0.3917854463615904, "grad_norm": 0.8531877994537354, "learning_rate": 2e-05, "loss": 0.6636, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6267, "tokens_per_second_per_gpu": 10673.28, "total_tokens": 618717376 }, { "epoch": 0.39184796199049765, "grad_norm": 0.8960951566696167, "learning_rate": 2e-05, "loss": 0.6495, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6268, "tokens_per_second_per_gpu": 10694.18, "total_tokens": 618813355 }, { "epoch": 0.39191047761940484, "grad_norm": 0.900255560874939, "learning_rate": 2e-05, "loss": 0.6472, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6269, "tokens_per_second_per_gpu": 10286.31, "total_tokens": 618908704 }, { "epoch": 0.3919729932483121, "grad_norm": 0.9908862113952637, "learning_rate": 2e-05, "loss": 0.638, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6270, "tokens_per_second_per_gpu": 10351.39, "total_tokens": 619009368 }, { "epoch": 0.39203550887721933, "grad_norm": 0.9486058354377747, "learning_rate": 2e-05, "loss": 0.6814, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6271, "tokens_per_second_per_gpu": 10462.2, "total_tokens": 619108598 }, { "epoch": 0.3920980245061265, "grad_norm": 0.9063660502433777, "learning_rate": 2e-05, "loss": 0.6523, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6272, "tokens_per_second_per_gpu": 10427.86, "total_tokens": 619208094 }, { "epoch": 0.39216054013503376, "grad_norm": 0.890885055065155, "learning_rate": 2e-05, "loss": 0.6516, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6273, "tokens_per_second_per_gpu": 10267.23, "total_tokens": 619305115 }, { "epoch": 0.392223055763941, "grad_norm": 0.9374485611915588, "learning_rate": 2e-05, "loss": 0.6415, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6274, "tokens_per_second_per_gpu": 10265.71, "total_tokens": 619402002 }, { "epoch": 0.3922855713928482, "grad_norm": 0.868686318397522, "learning_rate": 2e-05, "loss": 0.668, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6275, "tokens_per_second_per_gpu": 10904.61, "total_tokens": 619503780 }, { "epoch": 0.39234808702175544, "grad_norm": 0.8966872096061707, "learning_rate": 2e-05, "loss": 0.6339, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6276, "tokens_per_second_per_gpu": 10497.32, "total_tokens": 619599766 }, { "epoch": 0.3924106026506627, "grad_norm": 0.8827066421508789, "learning_rate": 2e-05, "loss": 0.6231, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6277, "tokens_per_second_per_gpu": 10875.79, "total_tokens": 619696152 }, { "epoch": 0.3924731182795699, "grad_norm": 0.9040972590446472, "learning_rate": 2e-05, "loss": 0.6609, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6278, "tokens_per_second_per_gpu": 11643.03, "total_tokens": 619797521 }, { "epoch": 0.3925356339084771, "grad_norm": 0.9262390732765198, "learning_rate": 2e-05, "loss": 0.6856, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6279, "tokens_per_second_per_gpu": 10279.81, "total_tokens": 619899652 }, { "epoch": 0.39259814953738437, "grad_norm": 0.8807494044303894, "learning_rate": 2e-05, "loss": 0.6429, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6280, "tokens_per_second_per_gpu": 10336.8, "total_tokens": 619999328 }, { "epoch": 0.39266066516629156, "grad_norm": 0.9062567949295044, "learning_rate": 2e-05, "loss": 0.6511, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6281, "tokens_per_second_per_gpu": 10375.97, "total_tokens": 620100775 }, { "epoch": 0.3927231807951988, "grad_norm": 0.8895746469497681, "learning_rate": 2e-05, "loss": 0.6655, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6282, "tokens_per_second_per_gpu": 9764.24, "total_tokens": 620198514 }, { "epoch": 0.39278569642410605, "grad_norm": 0.8771294951438904, "learning_rate": 2e-05, "loss": 0.6311, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6283, "tokens_per_second_per_gpu": 11226.07, "total_tokens": 620301749 }, { "epoch": 0.39284821205301323, "grad_norm": 0.8666313290596008, "learning_rate": 2e-05, "loss": 0.5915, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6284, "tokens_per_second_per_gpu": 10694.84, "total_tokens": 620400804 }, { "epoch": 0.3929107276819205, "grad_norm": 0.8887696862220764, "learning_rate": 2e-05, "loss": 0.6672, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6285, "tokens_per_second_per_gpu": 10768.79, "total_tokens": 620500584 }, { "epoch": 0.3929732433108277, "grad_norm": 0.8844718933105469, "learning_rate": 2e-05, "loss": 0.6578, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6286, "tokens_per_second_per_gpu": 10162.42, "total_tokens": 620601451 }, { "epoch": 0.3930357589397349, "grad_norm": 0.9197860360145569, "learning_rate": 2e-05, "loss": 0.6139, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6287, "tokens_per_second_per_gpu": 10806.48, "total_tokens": 620696018 }, { "epoch": 0.39309827456864216, "grad_norm": 0.8738242983818054, "learning_rate": 2e-05, "loss": 0.651, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6288, "tokens_per_second_per_gpu": 10644.73, "total_tokens": 620794231 }, { "epoch": 0.3931607901975494, "grad_norm": 0.8528106212615967, "learning_rate": 2e-05, "loss": 0.6548, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6289, "tokens_per_second_per_gpu": 10762.5, "total_tokens": 620893945 }, { "epoch": 0.3932233058264566, "grad_norm": 1.0606496334075928, "learning_rate": 2e-05, "loss": 0.6563, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6290, "tokens_per_second_per_gpu": 9654.04, "total_tokens": 620984227 }, { "epoch": 0.39328582145536384, "grad_norm": 0.9183599948883057, "learning_rate": 2e-05, "loss": 0.6693, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6291, "tokens_per_second_per_gpu": 10511.05, "total_tokens": 621082479 }, { "epoch": 0.3933483370842711, "grad_norm": 0.9120555520057678, "learning_rate": 2e-05, "loss": 0.6563, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6292, "tokens_per_second_per_gpu": 10515.82, "total_tokens": 621179636 }, { "epoch": 0.39341085271317827, "grad_norm": 0.9077968001365662, "learning_rate": 2e-05, "loss": 0.6154, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6293, "tokens_per_second_per_gpu": 9879.31, "total_tokens": 621273113 }, { "epoch": 0.3934733683420855, "grad_norm": 0.9309224486351013, "learning_rate": 2e-05, "loss": 0.6716, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6294, "tokens_per_second_per_gpu": 10510.98, "total_tokens": 621372766 }, { "epoch": 0.39353588397099276, "grad_norm": 0.875147819519043, "learning_rate": 2e-05, "loss": 0.6291, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6295, "tokens_per_second_per_gpu": 10562.48, "total_tokens": 621471316 }, { "epoch": 0.39359839959989995, "grad_norm": 0.9358242750167847, "learning_rate": 2e-05, "loss": 0.6734, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6296, "tokens_per_second_per_gpu": 10471.32, "total_tokens": 621572518 }, { "epoch": 0.3936609152288072, "grad_norm": 0.9500766396522522, "learning_rate": 2e-05, "loss": 0.691, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6297, "tokens_per_second_per_gpu": 10608.89, "total_tokens": 621671673 }, { "epoch": 0.39372343085771444, "grad_norm": 0.8733509182929993, "learning_rate": 2e-05, "loss": 0.65, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6298, "tokens_per_second_per_gpu": 10863.51, "total_tokens": 621769975 }, { "epoch": 0.39378594648662163, "grad_norm": 0.9032453298568726, "learning_rate": 2e-05, "loss": 0.6424, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6299, "tokens_per_second_per_gpu": 10822.41, "total_tokens": 621868988 }, { "epoch": 0.3938484621155289, "grad_norm": 0.9275427460670471, "learning_rate": 2e-05, "loss": 0.6664, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6300, "tokens_per_second_per_gpu": 10806.18, "total_tokens": 621970074 }, { "epoch": 0.3939109777444361, "grad_norm": 0.9411045908927917, "learning_rate": 2e-05, "loss": 0.6373, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6301, "tokens_per_second_per_gpu": 10458.93, "total_tokens": 622071265 }, { "epoch": 0.3939734933733433, "grad_norm": 0.9251176118850708, "learning_rate": 2e-05, "loss": 0.6788, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6302, "tokens_per_second_per_gpu": 10142.67, "total_tokens": 622166300 }, { "epoch": 0.39403600900225055, "grad_norm": 0.8971709609031677, "learning_rate": 2e-05, "loss": 0.7154, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6303, "tokens_per_second_per_gpu": 10316.01, "total_tokens": 622267606 }, { "epoch": 0.3940985246311578, "grad_norm": 0.875199019908905, "learning_rate": 2e-05, "loss": 0.6559, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6304, "tokens_per_second_per_gpu": 10819.54, "total_tokens": 622371056 }, { "epoch": 0.39416104026006504, "grad_norm": 0.9462149143218994, "learning_rate": 2e-05, "loss": 0.6741, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6305, "tokens_per_second_per_gpu": 10560.75, "total_tokens": 622472231 }, { "epoch": 0.39422355588897223, "grad_norm": 0.9777393341064453, "learning_rate": 2e-05, "loss": 0.6636, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6306, "tokens_per_second_per_gpu": 10744.17, "total_tokens": 622575072 }, { "epoch": 0.3942860715178795, "grad_norm": 0.9625337719917297, "learning_rate": 2e-05, "loss": 0.6929, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6307, "tokens_per_second_per_gpu": 10711.51, "total_tokens": 622674094 }, { "epoch": 0.3943485871467867, "grad_norm": 0.8959611058235168, "learning_rate": 2e-05, "loss": 0.6425, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6308, "tokens_per_second_per_gpu": 9408.2, "total_tokens": 622767965 }, { "epoch": 0.3944111027756939, "grad_norm": 0.8845866918563843, "learning_rate": 2e-05, "loss": 0.6145, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6309, "tokens_per_second_per_gpu": 9424.25, "total_tokens": 622864272 }, { "epoch": 0.39447361840460116, "grad_norm": 0.8702070116996765, "learning_rate": 2e-05, "loss": 0.6386, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6310, "tokens_per_second_per_gpu": 11163.66, "total_tokens": 622963246 }, { "epoch": 0.3945361340335084, "grad_norm": 0.9169362783432007, "learning_rate": 2e-05, "loss": 0.6867, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6311, "tokens_per_second_per_gpu": 11708.23, "total_tokens": 623062349 }, { "epoch": 0.3945986496624156, "grad_norm": 0.9054948091506958, "learning_rate": 2e-05, "loss": 0.6598, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6312, "tokens_per_second_per_gpu": 10787.63, "total_tokens": 623160585 }, { "epoch": 0.39466116529132284, "grad_norm": 0.8884334564208984, "learning_rate": 2e-05, "loss": 0.6823, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6313, "tokens_per_second_per_gpu": 10626.0, "total_tokens": 623261656 }, { "epoch": 0.3947236809202301, "grad_norm": 0.8620137572288513, "learning_rate": 2e-05, "loss": 0.6395, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6314, "tokens_per_second_per_gpu": 11010.37, "total_tokens": 623361589 }, { "epoch": 0.39478619654913727, "grad_norm": 0.9098958969116211, "learning_rate": 2e-05, "loss": 0.6691, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6315, "tokens_per_second_per_gpu": 10149.6, "total_tokens": 623460076 }, { "epoch": 0.3948487121780445, "grad_norm": 0.8989899158477783, "learning_rate": 2e-05, "loss": 0.6224, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6316, "tokens_per_second_per_gpu": 10230.68, "total_tokens": 623556934 }, { "epoch": 0.39491122780695176, "grad_norm": 0.9370908141136169, "learning_rate": 2e-05, "loss": 0.6906, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6317, "tokens_per_second_per_gpu": 10451.98, "total_tokens": 623657317 }, { "epoch": 0.39497374343585895, "grad_norm": 0.899834156036377, "learning_rate": 2e-05, "loss": 0.6492, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6318, "tokens_per_second_per_gpu": 10755.94, "total_tokens": 623755518 }, { "epoch": 0.3950362590647662, "grad_norm": 0.8768506646156311, "learning_rate": 2e-05, "loss": 0.6509, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6319, "tokens_per_second_per_gpu": 10913.1, "total_tokens": 623854061 }, { "epoch": 0.39509877469367344, "grad_norm": 0.9061285853385925, "learning_rate": 2e-05, "loss": 0.6454, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6320, "tokens_per_second_per_gpu": 10497.78, "total_tokens": 623950591 }, { "epoch": 0.3951612903225806, "grad_norm": 0.8909785747528076, "learning_rate": 2e-05, "loss": 0.6451, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6321, "tokens_per_second_per_gpu": 10608.34, "total_tokens": 624050513 }, { "epoch": 0.39522380595148787, "grad_norm": 0.8773221969604492, "learning_rate": 2e-05, "loss": 0.7027, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6322, "tokens_per_second_per_gpu": 10581.48, "total_tokens": 624152745 }, { "epoch": 0.3952863215803951, "grad_norm": 0.9042537212371826, "learning_rate": 2e-05, "loss": 0.6509, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6323, "tokens_per_second_per_gpu": 9980.16, "total_tokens": 624251147 }, { "epoch": 0.3953488372093023, "grad_norm": 0.8974682688713074, "learning_rate": 2e-05, "loss": 0.6457, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6324, "tokens_per_second_per_gpu": 10375.75, "total_tokens": 624351980 }, { "epoch": 0.39541135283820955, "grad_norm": 0.861996054649353, "learning_rate": 2e-05, "loss": 0.664, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6325, "tokens_per_second_per_gpu": 11720.94, "total_tokens": 624454894 }, { "epoch": 0.3954738684671168, "grad_norm": 0.8340627551078796, "learning_rate": 2e-05, "loss": 0.6532, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6326, "tokens_per_second_per_gpu": 10680.05, "total_tokens": 624553533 }, { "epoch": 0.395536384096024, "grad_norm": 0.8887754678726196, "learning_rate": 2e-05, "loss": 0.7093, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6327, "tokens_per_second_per_gpu": 10052.49, "total_tokens": 624651012 }, { "epoch": 0.39559889972493123, "grad_norm": 0.8815964460372925, "learning_rate": 2e-05, "loss": 0.6844, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6328, "tokens_per_second_per_gpu": 11198.54, "total_tokens": 624750855 }, { "epoch": 0.3956614153538385, "grad_norm": 0.8845426440238953, "learning_rate": 2e-05, "loss": 0.6325, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6329, "tokens_per_second_per_gpu": 10832.9, "total_tokens": 624851134 }, { "epoch": 0.39572393098274566, "grad_norm": 0.8448297381401062, "learning_rate": 2e-05, "loss": 0.6377, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6330, "tokens_per_second_per_gpu": 11054.74, "total_tokens": 624953225 }, { "epoch": 0.3957864466116529, "grad_norm": 0.9450991153717041, "learning_rate": 2e-05, "loss": 0.6339, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6331, "tokens_per_second_per_gpu": 10299.48, "total_tokens": 625048628 }, { "epoch": 0.39584896224056015, "grad_norm": 0.8772203922271729, "learning_rate": 2e-05, "loss": 0.6527, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6332, "tokens_per_second_per_gpu": 11093.94, "total_tokens": 625147785 }, { "epoch": 0.39591147786946734, "grad_norm": 0.879176914691925, "learning_rate": 2e-05, "loss": 0.6713, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6333, "tokens_per_second_per_gpu": 11502.86, "total_tokens": 625250675 }, { "epoch": 0.3959739934983746, "grad_norm": 0.9307541251182556, "learning_rate": 2e-05, "loss": 0.7015, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6334, "tokens_per_second_per_gpu": 10333.99, "total_tokens": 625351622 }, { "epoch": 0.39603650912728183, "grad_norm": 0.886127769947052, "learning_rate": 2e-05, "loss": 0.6971, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6335, "tokens_per_second_per_gpu": 10385.87, "total_tokens": 625452893 }, { "epoch": 0.396099024756189, "grad_norm": 0.8774137496948242, "learning_rate": 2e-05, "loss": 0.6224, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6336, "tokens_per_second_per_gpu": 10037.71, "total_tokens": 625552520 }, { "epoch": 0.39616154038509627, "grad_norm": 0.8692478537559509, "learning_rate": 2e-05, "loss": 0.6212, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6337, "tokens_per_second_per_gpu": 10538.22, "total_tokens": 625651764 }, { "epoch": 0.3962240560140035, "grad_norm": 0.9237654805183411, "learning_rate": 2e-05, "loss": 0.654, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6338, "tokens_per_second_per_gpu": 10663.82, "total_tokens": 625749430 }, { "epoch": 0.3962865716429107, "grad_norm": 0.9659417867660522, "learning_rate": 2e-05, "loss": 0.68, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6339, "tokens_per_second_per_gpu": 10178.14, "total_tokens": 625843283 }, { "epoch": 0.39634908727181795, "grad_norm": 0.8528451323509216, "learning_rate": 2e-05, "loss": 0.6419, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6340, "tokens_per_second_per_gpu": 10820.4, "total_tokens": 625944713 }, { "epoch": 0.3964116029007252, "grad_norm": 0.8920726776123047, "learning_rate": 2e-05, "loss": 0.6787, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6341, "tokens_per_second_per_gpu": 11061.82, "total_tokens": 626048903 }, { "epoch": 0.39647411852963244, "grad_norm": 0.8922575116157532, "learning_rate": 2e-05, "loss": 0.6427, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6342, "tokens_per_second_per_gpu": 8887.36, "total_tokens": 626143635 }, { "epoch": 0.3965366341585396, "grad_norm": 0.914158284664154, "learning_rate": 2e-05, "loss": 0.7068, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6343, "tokens_per_second_per_gpu": 10645.31, "total_tokens": 626242690 }, { "epoch": 0.39659914978744687, "grad_norm": 0.8834595084190369, "learning_rate": 2e-05, "loss": 0.6562, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6344, "tokens_per_second_per_gpu": 11141.4, "total_tokens": 626343597 }, { "epoch": 0.3966616654163541, "grad_norm": 0.8849765062332153, "learning_rate": 2e-05, "loss": 0.6754, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6345, "tokens_per_second_per_gpu": 10989.13, "total_tokens": 626444889 }, { "epoch": 0.3967241810452613, "grad_norm": 0.8981336951255798, "learning_rate": 2e-05, "loss": 0.6528, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6346, "tokens_per_second_per_gpu": 9999.79, "total_tokens": 626539424 }, { "epoch": 0.39678669667416855, "grad_norm": 0.8950278162956238, "learning_rate": 2e-05, "loss": 0.6551, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6347, "tokens_per_second_per_gpu": 10738.38, "total_tokens": 626639788 }, { "epoch": 0.3968492123030758, "grad_norm": 0.9245796203613281, "learning_rate": 2e-05, "loss": 0.6807, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6348, "tokens_per_second_per_gpu": 9785.69, "total_tokens": 626737560 }, { "epoch": 0.396911727931983, "grad_norm": 0.9141952395439148, "learning_rate": 2e-05, "loss": 0.6482, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6349, "tokens_per_second_per_gpu": 10241.95, "total_tokens": 626832646 }, { "epoch": 0.3969742435608902, "grad_norm": 0.9163078665733337, "learning_rate": 2e-05, "loss": 0.6704, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6350, "tokens_per_second_per_gpu": 9543.87, "total_tokens": 626929535 }, { "epoch": 0.3970367591897975, "grad_norm": 0.8742533922195435, "learning_rate": 2e-05, "loss": 0.6489, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6351, "tokens_per_second_per_gpu": 9898.76, "total_tokens": 627026680 }, { "epoch": 0.39709927481870466, "grad_norm": 0.8634387254714966, "learning_rate": 2e-05, "loss": 0.6734, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6352, "tokens_per_second_per_gpu": 10710.78, "total_tokens": 627129497 }, { "epoch": 0.3971617904476119, "grad_norm": 0.8571876883506775, "learning_rate": 2e-05, "loss": 0.6775, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6353, "tokens_per_second_per_gpu": 10592.02, "total_tokens": 627230242 }, { "epoch": 0.39722430607651915, "grad_norm": 0.8889204859733582, "learning_rate": 2e-05, "loss": 0.6633, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6354, "tokens_per_second_per_gpu": 11112.15, "total_tokens": 627329634 }, { "epoch": 0.39728682170542634, "grad_norm": 0.8677504658699036, "learning_rate": 2e-05, "loss": 0.6273, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6355, "tokens_per_second_per_gpu": 10951.82, "total_tokens": 627431052 }, { "epoch": 0.3973493373343336, "grad_norm": 0.8567488789558411, "learning_rate": 2e-05, "loss": 0.6447, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6356, "tokens_per_second_per_gpu": 9871.16, "total_tokens": 627529372 }, { "epoch": 0.39741185296324083, "grad_norm": 0.8979851007461548, "learning_rate": 2e-05, "loss": 0.6999, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6357, "tokens_per_second_per_gpu": 10459.25, "total_tokens": 627629026 }, { "epoch": 0.397474368592148, "grad_norm": 0.8891056180000305, "learning_rate": 2e-05, "loss": 0.6571, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6358, "tokens_per_second_per_gpu": 10860.2, "total_tokens": 627731659 }, { "epoch": 0.39753688422105526, "grad_norm": 0.9142078757286072, "learning_rate": 2e-05, "loss": 0.6797, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6359, "tokens_per_second_per_gpu": 10814.97, "total_tokens": 627835918 }, { "epoch": 0.3975993998499625, "grad_norm": 0.8842129111289978, "learning_rate": 2e-05, "loss": 0.6759, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6360, "tokens_per_second_per_gpu": 10024.68, "total_tokens": 627936585 }, { "epoch": 0.3976619154788697, "grad_norm": 0.8442328572273254, "learning_rate": 2e-05, "loss": 0.6091, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6361, "tokens_per_second_per_gpu": 10002.49, "total_tokens": 628034985 }, { "epoch": 0.39772443110777694, "grad_norm": 0.9099855422973633, "learning_rate": 2e-05, "loss": 0.6147, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6362, "tokens_per_second_per_gpu": 10050.2, "total_tokens": 628130861 }, { "epoch": 0.3977869467366842, "grad_norm": 0.8989598751068115, "learning_rate": 2e-05, "loss": 0.6586, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6363, "tokens_per_second_per_gpu": 11201.46, "total_tokens": 628233354 }, { "epoch": 0.3978494623655914, "grad_norm": 0.9134648442268372, "learning_rate": 2e-05, "loss": 0.6797, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6364, "tokens_per_second_per_gpu": 10946.66, "total_tokens": 628332528 }, { "epoch": 0.3979119779944986, "grad_norm": 0.9016058444976807, "learning_rate": 2e-05, "loss": 0.6434, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6365, "tokens_per_second_per_gpu": 10240.53, "total_tokens": 628429827 }, { "epoch": 0.39797449362340587, "grad_norm": 0.9533506035804749, "learning_rate": 2e-05, "loss": 0.6778, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6366, "tokens_per_second_per_gpu": 10419.61, "total_tokens": 628532573 }, { "epoch": 0.39803700925231306, "grad_norm": 0.8630259037017822, "learning_rate": 2e-05, "loss": 0.6346, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6367, "tokens_per_second_per_gpu": 10938.78, "total_tokens": 628633136 }, { "epoch": 0.3980995248812203, "grad_norm": 0.862456202507019, "learning_rate": 2e-05, "loss": 0.6013, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6368, "tokens_per_second_per_gpu": 10390.68, "total_tokens": 628731834 }, { "epoch": 0.39816204051012755, "grad_norm": 0.9021812677383423, "learning_rate": 2e-05, "loss": 0.6826, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6369, "tokens_per_second_per_gpu": 10874.07, "total_tokens": 628834089 }, { "epoch": 0.39822455613903474, "grad_norm": 0.8459353446960449, "learning_rate": 2e-05, "loss": 0.6233, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6370, "tokens_per_second_per_gpu": 11011.55, "total_tokens": 628935525 }, { "epoch": 0.398287071767942, "grad_norm": 0.9309459924697876, "learning_rate": 2e-05, "loss": 0.6505, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6371, "tokens_per_second_per_gpu": 11111.15, "total_tokens": 629034617 }, { "epoch": 0.3983495873968492, "grad_norm": 0.8768822550773621, "learning_rate": 2e-05, "loss": 0.651, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6372, "tokens_per_second_per_gpu": 11409.41, "total_tokens": 629136592 }, { "epoch": 0.3984121030257564, "grad_norm": 0.9172950983047485, "learning_rate": 2e-05, "loss": 0.6464, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6373, "tokens_per_second_per_gpu": 9795.07, "total_tokens": 629234057 }, { "epoch": 0.39847461865466366, "grad_norm": 0.8747600317001343, "learning_rate": 2e-05, "loss": 0.6519, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6374, "tokens_per_second_per_gpu": 10518.89, "total_tokens": 629331116 }, { "epoch": 0.3985371342835709, "grad_norm": 0.8823167085647583, "learning_rate": 2e-05, "loss": 0.7083, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6375, "tokens_per_second_per_gpu": 10776.14, "total_tokens": 629434328 }, { "epoch": 0.3985996499124781, "grad_norm": 0.8890892863273621, "learning_rate": 2e-05, "loss": 0.6169, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6376, "tokens_per_second_per_gpu": 9431.51, "total_tokens": 629528854 }, { "epoch": 0.39866216554138534, "grad_norm": 0.8661918640136719, "learning_rate": 2e-05, "loss": 0.701, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6377, "tokens_per_second_per_gpu": 10356.74, "total_tokens": 629629194 }, { "epoch": 0.3987246811702926, "grad_norm": 0.8608404994010925, "learning_rate": 2e-05, "loss": 0.6069, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6378, "tokens_per_second_per_gpu": 10141.53, "total_tokens": 629724624 }, { "epoch": 0.3987871967991998, "grad_norm": 0.9193806052207947, "learning_rate": 2e-05, "loss": 0.7264, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6379, "tokens_per_second_per_gpu": 10577.52, "total_tokens": 629822648 }, { "epoch": 0.398849712428107, "grad_norm": 0.8608404994010925, "learning_rate": 2e-05, "loss": 0.6387, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6380, "tokens_per_second_per_gpu": 10660.53, "total_tokens": 629920594 }, { "epoch": 0.39891222805701426, "grad_norm": 0.9226787686347961, "learning_rate": 2e-05, "loss": 0.6768, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6381, "tokens_per_second_per_gpu": 10206.38, "total_tokens": 630019506 }, { "epoch": 0.3989747436859215, "grad_norm": 0.8675191402435303, "learning_rate": 2e-05, "loss": 0.6255, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6382, "tokens_per_second_per_gpu": 10771.08, "total_tokens": 630117725 }, { "epoch": 0.3990372593148287, "grad_norm": 0.911499559879303, "learning_rate": 2e-05, "loss": 0.6371, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6383, "tokens_per_second_per_gpu": 10214.64, "total_tokens": 630215729 }, { "epoch": 0.39909977494373594, "grad_norm": 0.8733128905296326, "learning_rate": 2e-05, "loss": 0.6634, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6384, "tokens_per_second_per_gpu": 10357.56, "total_tokens": 630313493 }, { "epoch": 0.3991622905726432, "grad_norm": 0.8836804628372192, "learning_rate": 2e-05, "loss": 0.6427, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6385, "tokens_per_second_per_gpu": 10111.15, "total_tokens": 630412187 }, { "epoch": 0.3992248062015504, "grad_norm": 0.8969137668609619, "learning_rate": 2e-05, "loss": 0.6593, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6386, "tokens_per_second_per_gpu": 10374.1, "total_tokens": 630514102 }, { "epoch": 0.3992873218304576, "grad_norm": 0.8737624883651733, "learning_rate": 2e-05, "loss": 0.6524, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6387, "tokens_per_second_per_gpu": 11427.21, "total_tokens": 630615570 }, { "epoch": 0.39934983745936486, "grad_norm": 0.8914030194282532, "learning_rate": 2e-05, "loss": 0.6142, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6388, "tokens_per_second_per_gpu": 10345.06, "total_tokens": 630711391 }, { "epoch": 0.39941235308827205, "grad_norm": 0.8865117430686951, "learning_rate": 2e-05, "loss": 0.6445, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6389, "tokens_per_second_per_gpu": 10650.27, "total_tokens": 630811560 }, { "epoch": 0.3994748687171793, "grad_norm": 0.8893089890480042, "learning_rate": 2e-05, "loss": 0.6296, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6390, "tokens_per_second_per_gpu": 10761.38, "total_tokens": 630910310 }, { "epoch": 0.39953738434608654, "grad_norm": 0.862541675567627, "learning_rate": 2e-05, "loss": 0.7088, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6391, "tokens_per_second_per_gpu": 11548.03, "total_tokens": 631014241 }, { "epoch": 0.39959989997499373, "grad_norm": 0.8727346062660217, "learning_rate": 2e-05, "loss": 0.6332, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6392, "tokens_per_second_per_gpu": 10201.27, "total_tokens": 631111644 }, { "epoch": 0.399662415603901, "grad_norm": 0.9489003419876099, "learning_rate": 2e-05, "loss": 0.6637, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6393, "tokens_per_second_per_gpu": 10270.0, "total_tokens": 631210604 }, { "epoch": 0.3997249312328082, "grad_norm": 0.9029864072799683, "learning_rate": 2e-05, "loss": 0.6318, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6394, "tokens_per_second_per_gpu": 10610.32, "total_tokens": 631307280 }, { "epoch": 0.3997874468617154, "grad_norm": 0.873636782169342, "learning_rate": 2e-05, "loss": 0.6293, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6395, "tokens_per_second_per_gpu": 10537.96, "total_tokens": 631405953 }, { "epoch": 0.39984996249062266, "grad_norm": 0.9088065028190613, "learning_rate": 2e-05, "loss": 0.666, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6396, "tokens_per_second_per_gpu": 10849.26, "total_tokens": 631509195 }, { "epoch": 0.3999124781195299, "grad_norm": 0.8541581630706787, "learning_rate": 2e-05, "loss": 0.6209, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6397, "tokens_per_second_per_gpu": 10809.53, "total_tokens": 631609507 }, { "epoch": 0.3999749937484371, "grad_norm": 0.9185670614242554, "learning_rate": 2e-05, "loss": 0.7169, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6398, "tokens_per_second_per_gpu": 10422.09, "total_tokens": 631711458 }, { "epoch": 0.40003750937734434, "grad_norm": 0.8947533369064331, "learning_rate": 2e-05, "loss": 0.7098, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6399, "tokens_per_second_per_gpu": 11215.4, "total_tokens": 631811647 }, { "epoch": 0.4001000250062516, "grad_norm": 0.9076290130615234, "learning_rate": 2e-05, "loss": 0.6312, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6400, "tokens_per_second_per_gpu": 9877.36, "total_tokens": 631907608 }, { "epoch": 0.40016254063515877, "grad_norm": 0.9346937537193298, "learning_rate": 2e-05, "loss": 0.6702, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6401, "tokens_per_second_per_gpu": 10089.78, "total_tokens": 632006920 }, { "epoch": 0.400225056264066, "grad_norm": 0.9084065556526184, "learning_rate": 2e-05, "loss": 0.654, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6402, "tokens_per_second_per_gpu": 10882.63, "total_tokens": 632106742 }, { "epoch": 0.40028757189297326, "grad_norm": 0.9100580215454102, "learning_rate": 2e-05, "loss": 0.6853, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6403, "tokens_per_second_per_gpu": 10763.0, "total_tokens": 632206085 }, { "epoch": 0.40035008752188045, "grad_norm": 0.9282833337783813, "learning_rate": 2e-05, "loss": 0.6654, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6404, "tokens_per_second_per_gpu": 10507.28, "total_tokens": 632304266 }, { "epoch": 0.4004126031507877, "grad_norm": 0.908450722694397, "learning_rate": 2e-05, "loss": 0.6692, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6405, "tokens_per_second_per_gpu": 10381.84, "total_tokens": 632403845 }, { "epoch": 0.40047511877969494, "grad_norm": 0.8907320499420166, "learning_rate": 2e-05, "loss": 0.6371, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6406, "tokens_per_second_per_gpu": 10926.81, "total_tokens": 632502541 }, { "epoch": 0.40053763440860213, "grad_norm": 0.9340217113494873, "learning_rate": 2e-05, "loss": 0.6475, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6407, "tokens_per_second_per_gpu": 10767.96, "total_tokens": 632600254 }, { "epoch": 0.4006001500375094, "grad_norm": 0.827702522277832, "learning_rate": 2e-05, "loss": 0.6094, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6408, "tokens_per_second_per_gpu": 10530.37, "total_tokens": 632699895 }, { "epoch": 0.4006626656664166, "grad_norm": 0.8600909113883972, "learning_rate": 2e-05, "loss": 0.6257, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6409, "tokens_per_second_per_gpu": 9992.74, "total_tokens": 632797016 }, { "epoch": 0.4007251812953238, "grad_norm": 0.9213150143623352, "learning_rate": 2e-05, "loss": 0.7015, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6410, "tokens_per_second_per_gpu": 10761.04, "total_tokens": 632895134 }, { "epoch": 0.40078769692423105, "grad_norm": 0.8912473320960999, "learning_rate": 2e-05, "loss": 0.639, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6411, "tokens_per_second_per_gpu": 9652.48, "total_tokens": 632993628 }, { "epoch": 0.4008502125531383, "grad_norm": 0.8870776295661926, "learning_rate": 2e-05, "loss": 0.639, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6412, "tokens_per_second_per_gpu": 10836.94, "total_tokens": 633092617 }, { "epoch": 0.4009127281820455, "grad_norm": 0.8604024648666382, "learning_rate": 2e-05, "loss": 0.6192, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6413, "tokens_per_second_per_gpu": 10846.72, "total_tokens": 633192261 }, { "epoch": 0.40097524381095273, "grad_norm": 0.8816990852355957, "learning_rate": 2e-05, "loss": 0.6353, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6414, "tokens_per_second_per_gpu": 10814.13, "total_tokens": 633292670 }, { "epoch": 0.40103775943986, "grad_norm": 0.9026371836662292, "learning_rate": 2e-05, "loss": 0.6486, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6415, "tokens_per_second_per_gpu": 11291.22, "total_tokens": 633395278 }, { "epoch": 0.40110027506876716, "grad_norm": 0.9521036744117737, "learning_rate": 2e-05, "loss": 0.6659, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6416, "tokens_per_second_per_gpu": 13831.2, "total_tokens": 633497274 }, { "epoch": 0.4011627906976744, "grad_norm": 0.9186599254608154, "learning_rate": 2e-05, "loss": 0.6492, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6417, "tokens_per_second_per_gpu": 10973.82, "total_tokens": 633596853 }, { "epoch": 0.40122530632658165, "grad_norm": 0.894843339920044, "learning_rate": 2e-05, "loss": 0.6776, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6418, "tokens_per_second_per_gpu": 10781.94, "total_tokens": 633698460 }, { "epoch": 0.4012878219554889, "grad_norm": 0.9078201651573181, "learning_rate": 2e-05, "loss": 0.7068, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6419, "tokens_per_second_per_gpu": 10878.15, "total_tokens": 633799104 }, { "epoch": 0.4013503375843961, "grad_norm": 0.8784686326980591, "learning_rate": 2e-05, "loss": 0.6593, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6420, "tokens_per_second_per_gpu": 10788.87, "total_tokens": 633901714 }, { "epoch": 0.40141285321330333, "grad_norm": 0.8703371286392212, "learning_rate": 2e-05, "loss": 0.6239, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6421, "tokens_per_second_per_gpu": 10453.74, "total_tokens": 634003415 }, { "epoch": 0.4014753688422106, "grad_norm": 0.8810948729515076, "learning_rate": 2e-05, "loss": 0.6434, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6422, "tokens_per_second_per_gpu": 10143.12, "total_tokens": 634103137 }, { "epoch": 0.40153788447111777, "grad_norm": 0.9202737212181091, "learning_rate": 2e-05, "loss": 0.6727, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6423, "tokens_per_second_per_gpu": 10446.12, "total_tokens": 634204577 }, { "epoch": 0.401600400100025, "grad_norm": 0.8890969157218933, "learning_rate": 2e-05, "loss": 0.7359, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6424, "tokens_per_second_per_gpu": 10988.0, "total_tokens": 634306574 }, { "epoch": 0.40166291572893226, "grad_norm": 0.9106869697570801, "learning_rate": 2e-05, "loss": 0.724, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6425, "tokens_per_second_per_gpu": 10527.84, "total_tokens": 634407911 }, { "epoch": 0.40172543135783945, "grad_norm": 0.8982693552970886, "learning_rate": 2e-05, "loss": 0.6574, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6426, "tokens_per_second_per_gpu": 10749.21, "total_tokens": 634506117 }, { "epoch": 0.4017879469867467, "grad_norm": 0.8878635764122009, "learning_rate": 2e-05, "loss": 0.6451, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6427, "tokens_per_second_per_gpu": 10896.11, "total_tokens": 634605284 }, { "epoch": 0.40185046261565394, "grad_norm": 0.918398380279541, "learning_rate": 2e-05, "loss": 0.7022, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6428, "tokens_per_second_per_gpu": 10827.26, "total_tokens": 634706627 }, { "epoch": 0.4019129782445611, "grad_norm": 0.8900378942489624, "learning_rate": 2e-05, "loss": 0.6288, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6429, "tokens_per_second_per_gpu": 9450.91, "total_tokens": 634801064 }, { "epoch": 0.40197549387346837, "grad_norm": 0.8834217190742493, "learning_rate": 2e-05, "loss": 0.6367, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6430, "tokens_per_second_per_gpu": 9994.59, "total_tokens": 634894851 }, { "epoch": 0.4020380095023756, "grad_norm": 0.9336239099502563, "learning_rate": 2e-05, "loss": 0.6406, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6431, "tokens_per_second_per_gpu": 10541.49, "total_tokens": 634990700 }, { "epoch": 0.4021005251312828, "grad_norm": 0.8725531697273254, "learning_rate": 2e-05, "loss": 0.6551, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6432, "tokens_per_second_per_gpu": 11246.27, "total_tokens": 635093463 }, { "epoch": 0.40216304076019005, "grad_norm": 0.8830455541610718, "learning_rate": 2e-05, "loss": 0.6101, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6433, "tokens_per_second_per_gpu": 10188.69, "total_tokens": 635188703 }, { "epoch": 0.4022255563890973, "grad_norm": 0.8848308324813843, "learning_rate": 2e-05, "loss": 0.6587, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6434, "tokens_per_second_per_gpu": 10625.47, "total_tokens": 635290031 }, { "epoch": 0.4022880720180045, "grad_norm": 0.9111016392707825, "learning_rate": 2e-05, "loss": 0.6683, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6435, "tokens_per_second_per_gpu": 10502.92, "total_tokens": 635389462 }, { "epoch": 0.40235058764691173, "grad_norm": 0.8880135416984558, "learning_rate": 2e-05, "loss": 0.6375, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6436, "tokens_per_second_per_gpu": 10733.04, "total_tokens": 635488003 }, { "epoch": 0.402413103275819, "grad_norm": 0.8887292742729187, "learning_rate": 2e-05, "loss": 0.6497, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6437, "tokens_per_second_per_gpu": 11087.35, "total_tokens": 635586480 }, { "epoch": 0.40247561890472616, "grad_norm": 0.8732839226722717, "learning_rate": 2e-05, "loss": 0.6418, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6438, "tokens_per_second_per_gpu": 10570.47, "total_tokens": 635686843 }, { "epoch": 0.4025381345336334, "grad_norm": 0.91091388463974, "learning_rate": 2e-05, "loss": 0.6959, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6439, "tokens_per_second_per_gpu": 11146.36, "total_tokens": 635784159 }, { "epoch": 0.40260065016254065, "grad_norm": 0.9416198134422302, "learning_rate": 2e-05, "loss": 0.6411, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6440, "tokens_per_second_per_gpu": 10025.42, "total_tokens": 635881827 }, { "epoch": 0.40266316579144784, "grad_norm": 0.8942423462867737, "learning_rate": 2e-05, "loss": 0.6493, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6441, "tokens_per_second_per_gpu": 11079.54, "total_tokens": 635984785 }, { "epoch": 0.4027256814203551, "grad_norm": 0.8850153684616089, "learning_rate": 2e-05, "loss": 0.6546, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6442, "tokens_per_second_per_gpu": 10337.65, "total_tokens": 636085353 }, { "epoch": 0.40278819704926233, "grad_norm": 0.9066522121429443, "learning_rate": 2e-05, "loss": 0.6609, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6443, "tokens_per_second_per_gpu": 10123.72, "total_tokens": 636181268 }, { "epoch": 0.4028507126781695, "grad_norm": 0.8676299452781677, "learning_rate": 2e-05, "loss": 0.6318, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6444, "tokens_per_second_per_gpu": 10932.76, "total_tokens": 636282396 }, { "epoch": 0.40291322830707677, "grad_norm": 0.8925849795341492, "learning_rate": 2e-05, "loss": 0.6631, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6445, "tokens_per_second_per_gpu": 10750.82, "total_tokens": 636381869 }, { "epoch": 0.402975743935984, "grad_norm": 0.8506793975830078, "learning_rate": 2e-05, "loss": 0.6411, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6446, "tokens_per_second_per_gpu": 10691.47, "total_tokens": 636483072 }, { "epoch": 0.4030382595648912, "grad_norm": 0.9042147397994995, "learning_rate": 2e-05, "loss": 0.6319, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6447, "tokens_per_second_per_gpu": 10303.64, "total_tokens": 636582363 }, { "epoch": 0.40310077519379844, "grad_norm": 0.907819390296936, "learning_rate": 2e-05, "loss": 0.6332, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6448, "tokens_per_second_per_gpu": 10103.86, "total_tokens": 636677199 }, { "epoch": 0.4031632908227057, "grad_norm": 0.9070379734039307, "learning_rate": 2e-05, "loss": 0.6384, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6449, "tokens_per_second_per_gpu": 10044.76, "total_tokens": 636778298 }, { "epoch": 0.4032258064516129, "grad_norm": 0.9351245164871216, "learning_rate": 2e-05, "loss": 0.7045, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6450, "tokens_per_second_per_gpu": 10065.81, "total_tokens": 636876983 }, { "epoch": 0.4032883220805201, "grad_norm": 0.8792728781700134, "learning_rate": 2e-05, "loss": 0.6836, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6451, "tokens_per_second_per_gpu": 10446.95, "total_tokens": 636978314 }, { "epoch": 0.40335083770942737, "grad_norm": 0.9622794389724731, "learning_rate": 2e-05, "loss": 0.6601, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6452, "tokens_per_second_per_gpu": 10824.27, "total_tokens": 637074770 }, { "epoch": 0.40341335333833456, "grad_norm": 0.9160621762275696, "learning_rate": 2e-05, "loss": 0.6173, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6453, "tokens_per_second_per_gpu": 10283.64, "total_tokens": 637171347 }, { "epoch": 0.4034758689672418, "grad_norm": 0.9224494099617004, "learning_rate": 2e-05, "loss": 0.6161, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6454, "tokens_per_second_per_gpu": 11276.15, "total_tokens": 637271026 }, { "epoch": 0.40353838459614905, "grad_norm": 0.9484525918960571, "learning_rate": 2e-05, "loss": 0.6633, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6455, "tokens_per_second_per_gpu": 11284.39, "total_tokens": 637374935 }, { "epoch": 0.4036009002250563, "grad_norm": 0.8889931440353394, "learning_rate": 2e-05, "loss": 0.5845, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6456, "tokens_per_second_per_gpu": 10485.17, "total_tokens": 637472555 }, { "epoch": 0.4036634158539635, "grad_norm": 0.9399808049201965, "learning_rate": 2e-05, "loss": 0.6619, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6457, "tokens_per_second_per_gpu": 11373.36, "total_tokens": 637572141 }, { "epoch": 0.4037259314828707, "grad_norm": 0.9250860214233398, "learning_rate": 2e-05, "loss": 0.641, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6458, "tokens_per_second_per_gpu": 10529.43, "total_tokens": 637670361 }, { "epoch": 0.40378844711177797, "grad_norm": 0.8731564283370972, "learning_rate": 2e-05, "loss": 0.6522, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6459, "tokens_per_second_per_gpu": 10608.49, "total_tokens": 637770453 }, { "epoch": 0.40385096274068516, "grad_norm": 0.9990071058273315, "learning_rate": 2e-05, "loss": 0.6652, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6460, "tokens_per_second_per_gpu": 9520.03, "total_tokens": 637864440 }, { "epoch": 0.4039134783695924, "grad_norm": 0.8553634881973267, "learning_rate": 2e-05, "loss": 0.636, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6461, "tokens_per_second_per_gpu": 10739.23, "total_tokens": 637963982 }, { "epoch": 0.40397599399849965, "grad_norm": 0.8777015209197998, "learning_rate": 2e-05, "loss": 0.6298, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6462, "tokens_per_second_per_gpu": 10092.95, "total_tokens": 638061456 }, { "epoch": 0.40403850962740684, "grad_norm": 0.8914607167243958, "learning_rate": 2e-05, "loss": 0.6438, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6463, "tokens_per_second_per_gpu": 10384.89, "total_tokens": 638157916 }, { "epoch": 0.4041010252563141, "grad_norm": 0.8892972469329834, "learning_rate": 2e-05, "loss": 0.6491, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6464, "tokens_per_second_per_gpu": 10002.18, "total_tokens": 638256177 }, { "epoch": 0.40416354088522133, "grad_norm": 0.9157319664955139, "learning_rate": 2e-05, "loss": 0.6506, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6465, "tokens_per_second_per_gpu": 9601.28, "total_tokens": 638350730 }, { "epoch": 0.4042260565141285, "grad_norm": 0.9110360741615295, "learning_rate": 2e-05, "loss": 0.655, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6466, "tokens_per_second_per_gpu": 11112.04, "total_tokens": 638453952 }, { "epoch": 0.40428857214303576, "grad_norm": 0.9449720978736877, "learning_rate": 2e-05, "loss": 0.6483, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6467, "tokens_per_second_per_gpu": 9746.55, "total_tokens": 638550982 }, { "epoch": 0.404351087771943, "grad_norm": 0.858580470085144, "learning_rate": 2e-05, "loss": 0.6454, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6468, "tokens_per_second_per_gpu": 10247.97, "total_tokens": 638654972 }, { "epoch": 0.4044136034008502, "grad_norm": 0.8932615518569946, "learning_rate": 2e-05, "loss": 0.6984, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6469, "tokens_per_second_per_gpu": 10809.71, "total_tokens": 638756050 }, { "epoch": 0.40447611902975744, "grad_norm": 0.9306156635284424, "learning_rate": 2e-05, "loss": 0.6317, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6470, "tokens_per_second_per_gpu": 10865.87, "total_tokens": 638856382 }, { "epoch": 0.4045386346586647, "grad_norm": 0.8847694993019104, "learning_rate": 2e-05, "loss": 0.657, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6471, "tokens_per_second_per_gpu": 10564.56, "total_tokens": 638956886 }, { "epoch": 0.4046011502875719, "grad_norm": 0.9207212924957275, "learning_rate": 2e-05, "loss": 0.6409, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6472, "tokens_per_second_per_gpu": 10514.99, "total_tokens": 639049866 }, { "epoch": 0.4046636659164791, "grad_norm": 0.8777482509613037, "learning_rate": 2e-05, "loss": 0.6228, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6473, "tokens_per_second_per_gpu": 9874.59, "total_tokens": 639146687 }, { "epoch": 0.40472618154538637, "grad_norm": 0.8794594407081604, "learning_rate": 2e-05, "loss": 0.6538, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6474, "tokens_per_second_per_gpu": 11288.16, "total_tokens": 639247829 }, { "epoch": 0.40478869717429355, "grad_norm": 0.8961839079856873, "learning_rate": 2e-05, "loss": 0.6233, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6475, "tokens_per_second_per_gpu": 10707.32, "total_tokens": 639344181 }, { "epoch": 0.4048512128032008, "grad_norm": 0.928610622882843, "learning_rate": 2e-05, "loss": 0.6733, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6476, "tokens_per_second_per_gpu": 10789.97, "total_tokens": 639443779 }, { "epoch": 0.40491372843210804, "grad_norm": 0.9346580505371094, "learning_rate": 2e-05, "loss": 0.6834, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6477, "tokens_per_second_per_gpu": 10433.42, "total_tokens": 639545696 }, { "epoch": 0.40497624406101523, "grad_norm": 0.88739013671875, "learning_rate": 2e-05, "loss": 0.6269, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6478, "tokens_per_second_per_gpu": 11602.33, "total_tokens": 639649587 }, { "epoch": 0.4050387596899225, "grad_norm": 0.8959905505180359, "learning_rate": 2e-05, "loss": 0.6612, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6479, "tokens_per_second_per_gpu": 10400.86, "total_tokens": 639745862 }, { "epoch": 0.4051012753188297, "grad_norm": 0.8968198299407959, "learning_rate": 2e-05, "loss": 0.6598, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6480, "tokens_per_second_per_gpu": 10749.68, "total_tokens": 639844866 }, { "epoch": 0.4051637909477369, "grad_norm": 0.8910381197929382, "learning_rate": 2e-05, "loss": 0.6794, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6481, "tokens_per_second_per_gpu": 10662.03, "total_tokens": 639946434 }, { "epoch": 0.40522630657664416, "grad_norm": 0.9855452179908752, "learning_rate": 2e-05, "loss": 0.6475, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6482, "tokens_per_second_per_gpu": 10532.85, "total_tokens": 640047605 }, { "epoch": 0.4052888222055514, "grad_norm": 0.9336967468261719, "learning_rate": 2e-05, "loss": 0.6875, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6483, "tokens_per_second_per_gpu": 10201.84, "total_tokens": 640147586 }, { "epoch": 0.4053513378344586, "grad_norm": 0.8803712129592896, "learning_rate": 2e-05, "loss": 0.6271, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6484, "tokens_per_second_per_gpu": 10767.83, "total_tokens": 640247240 }, { "epoch": 0.40541385346336584, "grad_norm": 0.9275263547897339, "learning_rate": 2e-05, "loss": 0.663, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6485, "tokens_per_second_per_gpu": 10932.51, "total_tokens": 640349365 }, { "epoch": 0.4054763690922731, "grad_norm": 0.9445832967758179, "learning_rate": 2e-05, "loss": 0.6442, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6486, "tokens_per_second_per_gpu": 10546.17, "total_tokens": 640453141 }, { "epoch": 0.40553888472118027, "grad_norm": 0.9262361526489258, "learning_rate": 2e-05, "loss": 0.6438, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6487, "tokens_per_second_per_gpu": 9613.86, "total_tokens": 640550561 }, { "epoch": 0.4056014003500875, "grad_norm": 0.8628849983215332, "learning_rate": 2e-05, "loss": 0.6256, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6488, "tokens_per_second_per_gpu": 10138.35, "total_tokens": 640649640 }, { "epoch": 0.40566391597899476, "grad_norm": 0.8756942749023438, "learning_rate": 2e-05, "loss": 0.633, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6489, "tokens_per_second_per_gpu": 10982.29, "total_tokens": 640750292 }, { "epoch": 0.40572643160790195, "grad_norm": 0.9475703835487366, "learning_rate": 2e-05, "loss": 0.6732, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6490, "tokens_per_second_per_gpu": 10820.37, "total_tokens": 640848858 }, { "epoch": 0.4057889472368092, "grad_norm": 0.8951167464256287, "learning_rate": 2e-05, "loss": 0.6529, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6491, "tokens_per_second_per_gpu": 10682.53, "total_tokens": 640949902 }, { "epoch": 0.40585146286571644, "grad_norm": 0.9017933011054993, "learning_rate": 2e-05, "loss": 0.65, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6492, "tokens_per_second_per_gpu": 10919.01, "total_tokens": 641049719 }, { "epoch": 0.40591397849462363, "grad_norm": 0.8902245759963989, "learning_rate": 2e-05, "loss": 0.6829, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6493, "tokens_per_second_per_gpu": 10733.91, "total_tokens": 641149816 }, { "epoch": 0.4059764941235309, "grad_norm": 0.919399619102478, "learning_rate": 2e-05, "loss": 0.6383, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6494, "tokens_per_second_per_gpu": 10072.69, "total_tokens": 641244145 }, { "epoch": 0.4060390097524381, "grad_norm": 0.9166715741157532, "learning_rate": 2e-05, "loss": 0.7447, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6495, "tokens_per_second_per_gpu": 9926.19, "total_tokens": 641341845 }, { "epoch": 0.40610152538134536, "grad_norm": 0.9022083282470703, "learning_rate": 2e-05, "loss": 0.6827, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6496, "tokens_per_second_per_gpu": 10830.67, "total_tokens": 641438108 }, { "epoch": 0.40616404101025255, "grad_norm": 0.9073604941368103, "learning_rate": 2e-05, "loss": 0.629, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6497, "tokens_per_second_per_gpu": 10798.75, "total_tokens": 641532465 }, { "epoch": 0.4062265566391598, "grad_norm": 0.8581514954566956, "learning_rate": 2e-05, "loss": 0.6197, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6498, "tokens_per_second_per_gpu": 11114.74, "total_tokens": 641633164 }, { "epoch": 0.40628907226806704, "grad_norm": 0.9344154596328735, "learning_rate": 2e-05, "loss": 0.6669, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6499, "tokens_per_second_per_gpu": 10224.88, "total_tokens": 641730762 }, { "epoch": 0.40635158789697423, "grad_norm": 0.9151906967163086, "learning_rate": 2e-05, "loss": 0.6679, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6500, "tokens_per_second_per_gpu": 10268.35, "total_tokens": 641827449 }, { "epoch": 0.4064141035258815, "grad_norm": 0.8927214741706848, "learning_rate": 2e-05, "loss": 0.6758, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6501, "tokens_per_second_per_gpu": 11020.8, "total_tokens": 641929485 }, { "epoch": 0.4064766191547887, "grad_norm": 0.9624180197715759, "learning_rate": 2e-05, "loss": 0.6728, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6502, "tokens_per_second_per_gpu": 9374.29, "total_tokens": 642021144 }, { "epoch": 0.4065391347836959, "grad_norm": 0.8900631666183472, "learning_rate": 2e-05, "loss": 0.6519, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6503, "tokens_per_second_per_gpu": 10999.46, "total_tokens": 642122479 }, { "epoch": 0.40660165041260315, "grad_norm": 0.8790299296379089, "learning_rate": 2e-05, "loss": 0.6352, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6504, "tokens_per_second_per_gpu": 10117.86, "total_tokens": 642218806 }, { "epoch": 0.4066641660415104, "grad_norm": 0.9062291383743286, "learning_rate": 2e-05, "loss": 0.6553, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6505, "tokens_per_second_per_gpu": 10592.39, "total_tokens": 642315768 }, { "epoch": 0.4067266816704176, "grad_norm": 0.8852295875549316, "learning_rate": 2e-05, "loss": 0.6718, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6506, "tokens_per_second_per_gpu": 10654.81, "total_tokens": 642418450 }, { "epoch": 0.40678919729932483, "grad_norm": 0.9011371731758118, "learning_rate": 2e-05, "loss": 0.6817, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6507, "tokens_per_second_per_gpu": 10186.05, "total_tokens": 642517353 }, { "epoch": 0.4068517129282321, "grad_norm": 0.859589159488678, "learning_rate": 2e-05, "loss": 0.6609, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6508, "tokens_per_second_per_gpu": 10606.52, "total_tokens": 642619070 }, { "epoch": 0.40691422855713927, "grad_norm": 0.855344295501709, "learning_rate": 2e-05, "loss": 0.646, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6509, "tokens_per_second_per_gpu": 10246.02, "total_tokens": 642722394 }, { "epoch": 0.4069767441860465, "grad_norm": 0.9199565649032593, "learning_rate": 2e-05, "loss": 0.6301, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6510, "tokens_per_second_per_gpu": 9893.38, "total_tokens": 642820407 }, { "epoch": 0.40703925981495376, "grad_norm": 0.868447482585907, "learning_rate": 2e-05, "loss": 0.6566, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6511, "tokens_per_second_per_gpu": 10675.48, "total_tokens": 642920497 }, { "epoch": 0.40710177544386095, "grad_norm": 0.8949889540672302, "learning_rate": 2e-05, "loss": 0.6462, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6512, "tokens_per_second_per_gpu": 10840.68, "total_tokens": 643020142 }, { "epoch": 0.4071642910727682, "grad_norm": 0.882319986820221, "learning_rate": 2e-05, "loss": 0.6946, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6513, "tokens_per_second_per_gpu": 10918.01, "total_tokens": 643120512 }, { "epoch": 0.40722680670167544, "grad_norm": 0.9031293392181396, "learning_rate": 2e-05, "loss": 0.6458, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6514, "tokens_per_second_per_gpu": 8533.93, "total_tokens": 643213743 }, { "epoch": 0.4072893223305826, "grad_norm": 0.8610808849334717, "learning_rate": 2e-05, "loss": 0.668, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6515, "tokens_per_second_per_gpu": 11242.04, "total_tokens": 643316034 }, { "epoch": 0.40735183795948987, "grad_norm": 0.9066763520240784, "learning_rate": 2e-05, "loss": 0.6035, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6516, "tokens_per_second_per_gpu": 10446.09, "total_tokens": 643409893 }, { "epoch": 0.4074143535883971, "grad_norm": 0.8850124478340149, "learning_rate": 2e-05, "loss": 0.6525, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6517, "tokens_per_second_per_gpu": 10401.35, "total_tokens": 643510734 }, { "epoch": 0.4074768692173043, "grad_norm": 0.8698787093162537, "learning_rate": 2e-05, "loss": 0.6103, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6518, "tokens_per_second_per_gpu": 10609.75, "total_tokens": 643608153 }, { "epoch": 0.40753938484621155, "grad_norm": 0.9026893973350525, "learning_rate": 2e-05, "loss": 0.6752, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6519, "tokens_per_second_per_gpu": 10977.9, "total_tokens": 643709440 }, { "epoch": 0.4076019004751188, "grad_norm": 0.8680838942527771, "learning_rate": 2e-05, "loss": 0.6151, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6520, "tokens_per_second_per_gpu": 11228.23, "total_tokens": 643811690 }, { "epoch": 0.407664416104026, "grad_norm": 0.8950705528259277, "learning_rate": 2e-05, "loss": 0.6648, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6521, "tokens_per_second_per_gpu": 10661.65, "total_tokens": 643909175 }, { "epoch": 0.40772693173293323, "grad_norm": 0.8990964293479919, "learning_rate": 2e-05, "loss": 0.6863, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6522, "tokens_per_second_per_gpu": 10194.22, "total_tokens": 644008266 }, { "epoch": 0.4077894473618405, "grad_norm": 0.8899368643760681, "learning_rate": 2e-05, "loss": 0.6457, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6523, "tokens_per_second_per_gpu": 10101.44, "total_tokens": 644105297 }, { "epoch": 0.40785196299074766, "grad_norm": 0.8764702081680298, "learning_rate": 2e-05, "loss": 0.6601, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6524, "tokens_per_second_per_gpu": 10468.79, "total_tokens": 644207323 }, { "epoch": 0.4079144786196549, "grad_norm": 0.8983905911445618, "learning_rate": 2e-05, "loss": 0.6703, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6525, "tokens_per_second_per_gpu": 10607.06, "total_tokens": 644308258 }, { "epoch": 0.40797699424856215, "grad_norm": 0.8982749581336975, "learning_rate": 2e-05, "loss": 0.6291, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6526, "tokens_per_second_per_gpu": 10699.62, "total_tokens": 644403388 }, { "epoch": 0.40803950987746934, "grad_norm": 0.8940973281860352, "learning_rate": 2e-05, "loss": 0.7175, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6527, "tokens_per_second_per_gpu": 11641.11, "total_tokens": 644506555 }, { "epoch": 0.4081020255063766, "grad_norm": 0.8935333490371704, "learning_rate": 2e-05, "loss": 0.6539, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6528, "tokens_per_second_per_gpu": 10500.73, "total_tokens": 644606335 }, { "epoch": 0.40816454113528383, "grad_norm": 0.9205946326255798, "learning_rate": 2e-05, "loss": 0.6867, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6529, "tokens_per_second_per_gpu": 10919.07, "total_tokens": 644706921 }, { "epoch": 0.408227056764191, "grad_norm": 0.8366912603378296, "learning_rate": 2e-05, "loss": 0.6228, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6530, "tokens_per_second_per_gpu": 10156.8, "total_tokens": 644807001 }, { "epoch": 0.40828957239309827, "grad_norm": 0.8585115671157837, "learning_rate": 2e-05, "loss": 0.6444, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6531, "tokens_per_second_per_gpu": 10781.28, "total_tokens": 644908807 }, { "epoch": 0.4083520880220055, "grad_norm": 0.9219505786895752, "learning_rate": 2e-05, "loss": 0.6306, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6532, "tokens_per_second_per_gpu": 10605.79, "total_tokens": 645007157 }, { "epoch": 0.40841460365091276, "grad_norm": 0.9173984527587891, "learning_rate": 2e-05, "loss": 0.6547, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6533, "tokens_per_second_per_gpu": 10890.0, "total_tokens": 645106557 }, { "epoch": 0.40847711927981994, "grad_norm": 0.8793484568595886, "learning_rate": 2e-05, "loss": 0.6447, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6534, "tokens_per_second_per_gpu": 10888.92, "total_tokens": 645209215 }, { "epoch": 0.4085396349087272, "grad_norm": 0.9000608921051025, "learning_rate": 2e-05, "loss": 0.6345, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6535, "tokens_per_second_per_gpu": 10395.56, "total_tokens": 645309717 }, { "epoch": 0.40860215053763443, "grad_norm": 0.8797548413276672, "learning_rate": 2e-05, "loss": 0.6753, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6536, "tokens_per_second_per_gpu": 10756.85, "total_tokens": 645411501 }, { "epoch": 0.4086646661665416, "grad_norm": 0.9376277327537537, "learning_rate": 2e-05, "loss": 0.6534, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6537, "tokens_per_second_per_gpu": 10199.73, "total_tokens": 645512911 }, { "epoch": 0.40872718179544887, "grad_norm": 0.9751906991004944, "learning_rate": 2e-05, "loss": 0.6557, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6538, "tokens_per_second_per_gpu": 10188.73, "total_tokens": 645606834 }, { "epoch": 0.4087896974243561, "grad_norm": 0.9598580002784729, "learning_rate": 2e-05, "loss": 0.6344, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6539, "tokens_per_second_per_gpu": 10899.33, "total_tokens": 645707485 }, { "epoch": 0.4088522130532633, "grad_norm": 0.8642105460166931, "learning_rate": 2e-05, "loss": 0.653, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6540, "tokens_per_second_per_gpu": 10871.44, "total_tokens": 645806422 }, { "epoch": 0.40891472868217055, "grad_norm": 0.9184350371360779, "learning_rate": 2e-05, "loss": 0.6411, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6541, "tokens_per_second_per_gpu": 10288.53, "total_tokens": 645904837 }, { "epoch": 0.4089772443110778, "grad_norm": 0.8664337396621704, "learning_rate": 2e-05, "loss": 0.666, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6542, "tokens_per_second_per_gpu": 11244.76, "total_tokens": 646008283 }, { "epoch": 0.409039759939985, "grad_norm": 0.9109747409820557, "learning_rate": 2e-05, "loss": 0.6584, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6543, "tokens_per_second_per_gpu": 10405.24, "total_tokens": 646108781 }, { "epoch": 0.4091022755688922, "grad_norm": 0.8799367547035217, "learning_rate": 2e-05, "loss": 0.6585, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6544, "tokens_per_second_per_gpu": 10827.41, "total_tokens": 646214251 }, { "epoch": 0.40916479119779947, "grad_norm": 0.9217072129249573, "learning_rate": 2e-05, "loss": 0.6419, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6545, "tokens_per_second_per_gpu": 10251.02, "total_tokens": 646312501 }, { "epoch": 0.40922730682670666, "grad_norm": 0.8604180216789246, "learning_rate": 2e-05, "loss": 0.6254, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6546, "tokens_per_second_per_gpu": 9926.6, "total_tokens": 646407160 }, { "epoch": 0.4092898224556139, "grad_norm": 0.902129590511322, "learning_rate": 2e-05, "loss": 0.6738, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6547, "tokens_per_second_per_gpu": 10709.55, "total_tokens": 646506289 }, { "epoch": 0.40935233808452115, "grad_norm": 0.9673002362251282, "learning_rate": 2e-05, "loss": 0.6778, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6548, "tokens_per_second_per_gpu": 11079.76, "total_tokens": 646604852 }, { "epoch": 0.40941485371342834, "grad_norm": 0.9153210520744324, "learning_rate": 2e-05, "loss": 0.614, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6549, "tokens_per_second_per_gpu": 9448.92, "total_tokens": 646698463 }, { "epoch": 0.4094773693423356, "grad_norm": 0.9032054543495178, "learning_rate": 2e-05, "loss": 0.6402, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6550, "tokens_per_second_per_gpu": 9904.2, "total_tokens": 646795020 }, { "epoch": 0.40953988497124283, "grad_norm": 0.9134637713432312, "learning_rate": 2e-05, "loss": 0.7098, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6551, "tokens_per_second_per_gpu": 10978.68, "total_tokens": 646893384 }, { "epoch": 0.40960240060015, "grad_norm": 0.878343403339386, "learning_rate": 2e-05, "loss": 0.5864, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6552, "tokens_per_second_per_gpu": 10650.88, "total_tokens": 646989886 }, { "epoch": 0.40966491622905726, "grad_norm": 0.8625624179840088, "learning_rate": 2e-05, "loss": 0.6219, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6553, "tokens_per_second_per_gpu": 10589.76, "total_tokens": 647086914 }, { "epoch": 0.4097274318579645, "grad_norm": 0.9073010087013245, "learning_rate": 2e-05, "loss": 0.6462, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6554, "tokens_per_second_per_gpu": 9440.61, "total_tokens": 647180971 }, { "epoch": 0.4097899474868717, "grad_norm": 0.8698256611824036, "learning_rate": 2e-05, "loss": 0.6452, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6555, "tokens_per_second_per_gpu": 10601.2, "total_tokens": 647279955 }, { "epoch": 0.40985246311577894, "grad_norm": 0.8774018287658691, "learning_rate": 2e-05, "loss": 0.6553, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6556, "tokens_per_second_per_gpu": 10975.81, "total_tokens": 647378524 }, { "epoch": 0.4099149787446862, "grad_norm": 0.8590852618217468, "learning_rate": 2e-05, "loss": 0.6942, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6557, "tokens_per_second_per_gpu": 10687.91, "total_tokens": 647479533 }, { "epoch": 0.4099774943735934, "grad_norm": 0.9209967255592346, "learning_rate": 2e-05, "loss": 0.6613, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6558, "tokens_per_second_per_gpu": 11341.74, "total_tokens": 647581211 }, { "epoch": 0.4100400100025006, "grad_norm": 0.8359355330467224, "learning_rate": 2e-05, "loss": 0.6025, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6559, "tokens_per_second_per_gpu": 10781.8, "total_tokens": 647679385 }, { "epoch": 0.41010252563140787, "grad_norm": 0.9013577699661255, "learning_rate": 2e-05, "loss": 0.6323, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6560, "tokens_per_second_per_gpu": 9611.44, "total_tokens": 647772078 }, { "epoch": 0.41016504126031506, "grad_norm": 0.8787969350814819, "learning_rate": 2e-05, "loss": 0.6109, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6561, "tokens_per_second_per_gpu": 9933.13, "total_tokens": 647870074 }, { "epoch": 0.4102275568892223, "grad_norm": 0.9090985655784607, "learning_rate": 2e-05, "loss": 0.6805, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6562, "tokens_per_second_per_gpu": 11066.69, "total_tokens": 647969741 }, { "epoch": 0.41029007251812954, "grad_norm": 0.9238561391830444, "learning_rate": 2e-05, "loss": 0.6406, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6563, "tokens_per_second_per_gpu": 11022.03, "total_tokens": 648069431 }, { "epoch": 0.41035258814703673, "grad_norm": 0.9179244041442871, "learning_rate": 2e-05, "loss": 0.6341, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6564, "tokens_per_second_per_gpu": 10481.65, "total_tokens": 648166245 }, { "epoch": 0.410415103775944, "grad_norm": 0.8783592581748962, "learning_rate": 2e-05, "loss": 0.6787, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6565, "tokens_per_second_per_gpu": 10728.7, "total_tokens": 648266489 }, { "epoch": 0.4104776194048512, "grad_norm": 0.9076442122459412, "learning_rate": 2e-05, "loss": 0.6727, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6566, "tokens_per_second_per_gpu": 9419.94, "total_tokens": 648362312 }, { "epoch": 0.4105401350337584, "grad_norm": 0.9152358174324036, "learning_rate": 2e-05, "loss": 0.6431, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6567, "tokens_per_second_per_gpu": 10966.25, "total_tokens": 648460735 }, { "epoch": 0.41060265066266566, "grad_norm": 0.9110146760940552, "learning_rate": 2e-05, "loss": 0.6471, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6568, "tokens_per_second_per_gpu": 10636.63, "total_tokens": 648562768 }, { "epoch": 0.4106651662915729, "grad_norm": 0.9267656207084656, "learning_rate": 2e-05, "loss": 0.6581, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6569, "tokens_per_second_per_gpu": 10615.82, "total_tokens": 648662847 }, { "epoch": 0.41072768192048015, "grad_norm": 0.9958627223968506, "learning_rate": 2e-05, "loss": 0.6602, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6570, "tokens_per_second_per_gpu": 10039.05, "total_tokens": 648758864 }, { "epoch": 0.41079019754938734, "grad_norm": 0.9194303750991821, "learning_rate": 2e-05, "loss": 0.6408, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6571, "tokens_per_second_per_gpu": 10449.14, "total_tokens": 648853836 }, { "epoch": 0.4108527131782946, "grad_norm": 0.898607075214386, "learning_rate": 2e-05, "loss": 0.6778, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6572, "tokens_per_second_per_gpu": 10507.22, "total_tokens": 648955096 }, { "epoch": 0.4109152288072018, "grad_norm": 0.9552650451660156, "learning_rate": 2e-05, "loss": 0.7009, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6573, "tokens_per_second_per_gpu": 10202.75, "total_tokens": 649054918 }, { "epoch": 0.410977744436109, "grad_norm": 1.0181339979171753, "learning_rate": 2e-05, "loss": 0.6085, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6574, "tokens_per_second_per_gpu": 10838.89, "total_tokens": 649150004 }, { "epoch": 0.41104026006501626, "grad_norm": 0.8643632531166077, "learning_rate": 2e-05, "loss": 0.6284, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6575, "tokens_per_second_per_gpu": 10875.47, "total_tokens": 649251336 }, { "epoch": 0.4111027756939235, "grad_norm": 0.8811123967170715, "learning_rate": 2e-05, "loss": 0.6754, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6576, "tokens_per_second_per_gpu": 10378.12, "total_tokens": 649354995 }, { "epoch": 0.4111652913228307, "grad_norm": 0.9189615249633789, "learning_rate": 2e-05, "loss": 0.7358, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6577, "tokens_per_second_per_gpu": 10638.35, "total_tokens": 649452931 }, { "epoch": 0.41122780695173794, "grad_norm": 0.8421486020088196, "learning_rate": 2e-05, "loss": 0.6539, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6578, "tokens_per_second_per_gpu": 11251.82, "total_tokens": 649552797 }, { "epoch": 0.4112903225806452, "grad_norm": 0.9139798879623413, "learning_rate": 2e-05, "loss": 0.6759, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6579, "tokens_per_second_per_gpu": 10978.7, "total_tokens": 649654236 }, { "epoch": 0.4113528382095524, "grad_norm": 0.9006572365760803, "learning_rate": 2e-05, "loss": 0.657, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6580, "tokens_per_second_per_gpu": 10473.36, "total_tokens": 649754107 }, { "epoch": 0.4114153538384596, "grad_norm": 0.9050447940826416, "learning_rate": 2e-05, "loss": 0.6456, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6581, "tokens_per_second_per_gpu": 11135.03, "total_tokens": 649853868 }, { "epoch": 0.41147786946736686, "grad_norm": 0.8897993564605713, "learning_rate": 2e-05, "loss": 0.6261, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6582, "tokens_per_second_per_gpu": 10022.29, "total_tokens": 649949492 }, { "epoch": 0.41154038509627405, "grad_norm": 0.9296534657478333, "learning_rate": 2e-05, "loss": 0.7421, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6583, "tokens_per_second_per_gpu": 10817.33, "total_tokens": 650053444 }, { "epoch": 0.4116029007251813, "grad_norm": 0.8818507194519043, "learning_rate": 2e-05, "loss": 0.6824, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6584, "tokens_per_second_per_gpu": 10236.24, "total_tokens": 650156664 }, { "epoch": 0.41166541635408854, "grad_norm": 0.9212092757225037, "learning_rate": 2e-05, "loss": 0.6858, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6585, "tokens_per_second_per_gpu": 11047.54, "total_tokens": 650254671 }, { "epoch": 0.41172793198299573, "grad_norm": 0.9018164277076721, "learning_rate": 2e-05, "loss": 0.6417, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6586, "tokens_per_second_per_gpu": 10509.76, "total_tokens": 650352763 }, { "epoch": 0.411790447611903, "grad_norm": 0.8901646137237549, "learning_rate": 2e-05, "loss": 0.6368, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6587, "tokens_per_second_per_gpu": 10075.28, "total_tokens": 650451345 }, { "epoch": 0.4118529632408102, "grad_norm": 0.8926506042480469, "learning_rate": 2e-05, "loss": 0.6395, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6588, "tokens_per_second_per_gpu": 10751.72, "total_tokens": 650551869 }, { "epoch": 0.4119154788697174, "grad_norm": 0.9109511375427246, "learning_rate": 2e-05, "loss": 0.648, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6589, "tokens_per_second_per_gpu": 10541.33, "total_tokens": 650650061 }, { "epoch": 0.41197799449862466, "grad_norm": 0.8819789290428162, "learning_rate": 2e-05, "loss": 0.6162, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6590, "tokens_per_second_per_gpu": 10586.85, "total_tokens": 650750820 }, { "epoch": 0.4120405101275319, "grad_norm": 0.8925759792327881, "learning_rate": 2e-05, "loss": 0.6672, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6591, "tokens_per_second_per_gpu": 10632.71, "total_tokens": 650846789 }, { "epoch": 0.4121030257564391, "grad_norm": 0.8796306252479553, "learning_rate": 2e-05, "loss": 0.6475, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6592, "tokens_per_second_per_gpu": 10248.81, "total_tokens": 650942201 }, { "epoch": 0.41216554138534633, "grad_norm": 0.8806234002113342, "learning_rate": 2e-05, "loss": 0.6511, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6593, "tokens_per_second_per_gpu": 10989.53, "total_tokens": 651046719 }, { "epoch": 0.4122280570142536, "grad_norm": 0.9559330940246582, "learning_rate": 2e-05, "loss": 0.6543, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6594, "tokens_per_second_per_gpu": 11134.33, "total_tokens": 651148536 }, { "epoch": 0.41229057264316077, "grad_norm": 0.8640850186347961, "learning_rate": 2e-05, "loss": 0.6361, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6595, "tokens_per_second_per_gpu": 10607.93, "total_tokens": 651247405 }, { "epoch": 0.412353088272068, "grad_norm": 0.8681162595748901, "learning_rate": 2e-05, "loss": 0.6149, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6596, "tokens_per_second_per_gpu": 9688.0, "total_tokens": 651342218 }, { "epoch": 0.41241560390097526, "grad_norm": 0.9123163223266602, "learning_rate": 2e-05, "loss": 0.6657, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6597, "tokens_per_second_per_gpu": 10094.61, "total_tokens": 651440616 }, { "epoch": 0.41247811952988245, "grad_norm": 0.9231603741645813, "learning_rate": 2e-05, "loss": 0.6299, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6598, "tokens_per_second_per_gpu": 10373.89, "total_tokens": 651537483 }, { "epoch": 0.4125406351587897, "grad_norm": 0.9152154326438904, "learning_rate": 2e-05, "loss": 0.6393, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6599, "tokens_per_second_per_gpu": 10762.66, "total_tokens": 651637654 }, { "epoch": 0.41260315078769694, "grad_norm": 0.8960404396057129, "learning_rate": 2e-05, "loss": 0.6697, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6600, "tokens_per_second_per_gpu": 10392.52, "total_tokens": 651733097 }, { "epoch": 0.4126656664166041, "grad_norm": 0.8997039794921875, "learning_rate": 2e-05, "loss": 0.6929, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6601, "tokens_per_second_per_gpu": 10730.06, "total_tokens": 651832598 }, { "epoch": 0.41272818204551137, "grad_norm": 0.8729366064071655, "learning_rate": 2e-05, "loss": 0.6742, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6602, "tokens_per_second_per_gpu": 10624.62, "total_tokens": 651936194 }, { "epoch": 0.4127906976744186, "grad_norm": 0.8994916677474976, "learning_rate": 2e-05, "loss": 0.6673, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6603, "tokens_per_second_per_gpu": 10246.9, "total_tokens": 652033816 }, { "epoch": 0.4128532133033258, "grad_norm": 0.8320327997207642, "learning_rate": 2e-05, "loss": 0.613, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6604, "tokens_per_second_per_gpu": 10927.1, "total_tokens": 652135242 }, { "epoch": 0.41291572893223305, "grad_norm": 0.8936772346496582, "learning_rate": 2e-05, "loss": 0.6402, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6605, "tokens_per_second_per_gpu": 10432.5, "total_tokens": 652234044 }, { "epoch": 0.4129782445611403, "grad_norm": 0.8682926297187805, "learning_rate": 2e-05, "loss": 0.6328, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6606, "tokens_per_second_per_gpu": 10883.72, "total_tokens": 652334496 }, { "epoch": 0.4130407601900475, "grad_norm": 0.9004305601119995, "learning_rate": 2e-05, "loss": 0.6309, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6607, "tokens_per_second_per_gpu": 10808.72, "total_tokens": 652435998 }, { "epoch": 0.41310327581895473, "grad_norm": 0.8961554765701294, "learning_rate": 2e-05, "loss": 0.6947, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6608, "tokens_per_second_per_gpu": 9817.97, "total_tokens": 652533306 }, { "epoch": 0.413165791447862, "grad_norm": 0.9410560727119446, "learning_rate": 2e-05, "loss": 0.7377, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6609, "tokens_per_second_per_gpu": 10312.6, "total_tokens": 652632611 }, { "epoch": 0.4132283070767692, "grad_norm": 0.8857280611991882, "learning_rate": 2e-05, "loss": 0.6161, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6610, "tokens_per_second_per_gpu": 10491.39, "total_tokens": 652728412 }, { "epoch": 0.4132908227056764, "grad_norm": 0.8891586065292358, "learning_rate": 2e-05, "loss": 0.6304, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6611, "tokens_per_second_per_gpu": 9939.45, "total_tokens": 652827127 }, { "epoch": 0.41335333833458365, "grad_norm": 0.8604357242584229, "learning_rate": 2e-05, "loss": 0.6434, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6612, "tokens_per_second_per_gpu": 11587.45, "total_tokens": 652927808 }, { "epoch": 0.4134158539634909, "grad_norm": 1.1117966175079346, "learning_rate": 2e-05, "loss": 0.6503, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6613, "tokens_per_second_per_gpu": 9949.0, "total_tokens": 653025002 }, { "epoch": 0.4134783695923981, "grad_norm": 0.8744484782218933, "learning_rate": 2e-05, "loss": 0.6724, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6614, "tokens_per_second_per_gpu": 11285.42, "total_tokens": 653125088 }, { "epoch": 0.41354088522130533, "grad_norm": 1.0039894580841064, "learning_rate": 2e-05, "loss": 0.6835, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6615, "tokens_per_second_per_gpu": 10623.05, "total_tokens": 653223149 }, { "epoch": 0.4136034008502126, "grad_norm": 0.8771858811378479, "learning_rate": 2e-05, "loss": 0.6746, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6616, "tokens_per_second_per_gpu": 11461.71, "total_tokens": 653326292 }, { "epoch": 0.41366591647911977, "grad_norm": 0.9523047208786011, "learning_rate": 2e-05, "loss": 0.6427, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6617, "tokens_per_second_per_gpu": 11101.42, "total_tokens": 653422823 }, { "epoch": 0.413728432108027, "grad_norm": 0.9025856852531433, "learning_rate": 2e-05, "loss": 0.6751, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6618, "tokens_per_second_per_gpu": 10665.52, "total_tokens": 653522795 }, { "epoch": 0.41379094773693426, "grad_norm": 0.8996874094009399, "learning_rate": 2e-05, "loss": 0.6535, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6619, "tokens_per_second_per_gpu": 10674.34, "total_tokens": 653625344 }, { "epoch": 0.41385346336584145, "grad_norm": 0.893535315990448, "learning_rate": 2e-05, "loss": 0.6447, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6620, "tokens_per_second_per_gpu": 11380.2, "total_tokens": 653724710 }, { "epoch": 0.4139159789947487, "grad_norm": 0.9002283811569214, "learning_rate": 2e-05, "loss": 0.672, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6621, "tokens_per_second_per_gpu": 10395.84, "total_tokens": 653827089 }, { "epoch": 0.41397849462365593, "grad_norm": 0.9127240180969238, "learning_rate": 2e-05, "loss": 0.6855, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6622, "tokens_per_second_per_gpu": 10056.29, "total_tokens": 653925936 }, { "epoch": 0.4140410102525631, "grad_norm": 0.8845983743667603, "learning_rate": 2e-05, "loss": 0.6803, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6623, "tokens_per_second_per_gpu": 11034.37, "total_tokens": 654029588 }, { "epoch": 0.41410352588147037, "grad_norm": 0.9311068654060364, "learning_rate": 2e-05, "loss": 0.6323, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6624, "tokens_per_second_per_gpu": 10854.52, "total_tokens": 654126898 }, { "epoch": 0.4141660415103776, "grad_norm": 0.8875272870063782, "learning_rate": 2e-05, "loss": 0.6385, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6625, "tokens_per_second_per_gpu": 10456.8, "total_tokens": 654226476 }, { "epoch": 0.4142285571392848, "grad_norm": 0.8877583742141724, "learning_rate": 2e-05, "loss": 0.6461, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6626, "tokens_per_second_per_gpu": 11197.95, "total_tokens": 654325406 }, { "epoch": 0.41429107276819205, "grad_norm": 0.9128864407539368, "learning_rate": 2e-05, "loss": 0.7061, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6627, "tokens_per_second_per_gpu": 10481.29, "total_tokens": 654425611 }, { "epoch": 0.4143535883970993, "grad_norm": 0.8930550217628479, "learning_rate": 2e-05, "loss": 0.6516, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6628, "tokens_per_second_per_gpu": 10418.42, "total_tokens": 654525015 }, { "epoch": 0.4144161040260065, "grad_norm": 0.8559902906417847, "learning_rate": 2e-05, "loss": 0.6519, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6629, "tokens_per_second_per_gpu": 10356.41, "total_tokens": 654623367 }, { "epoch": 0.4144786196549137, "grad_norm": 0.9757600426673889, "learning_rate": 2e-05, "loss": 0.6866, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6630, "tokens_per_second_per_gpu": 10478.36, "total_tokens": 654725339 }, { "epoch": 0.41454113528382097, "grad_norm": 0.9056301116943359, "learning_rate": 2e-05, "loss": 0.6789, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6631, "tokens_per_second_per_gpu": 11227.75, "total_tokens": 654827388 }, { "epoch": 0.41460365091272816, "grad_norm": 0.8448135852813721, "learning_rate": 2e-05, "loss": 0.6411, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6632, "tokens_per_second_per_gpu": 11028.75, "total_tokens": 654928314 }, { "epoch": 0.4146661665416354, "grad_norm": 0.9380828738212585, "learning_rate": 2e-05, "loss": 0.6151, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6633, "tokens_per_second_per_gpu": 10460.16, "total_tokens": 655022684 }, { "epoch": 0.41472868217054265, "grad_norm": 0.9046449065208435, "learning_rate": 2e-05, "loss": 0.6335, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6634, "tokens_per_second_per_gpu": 10872.82, "total_tokens": 655120575 }, { "epoch": 0.41479119779944984, "grad_norm": 0.9336227774620056, "learning_rate": 2e-05, "loss": 0.6781, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6635, "tokens_per_second_per_gpu": 10296.4, "total_tokens": 655221792 }, { "epoch": 0.4148537134283571, "grad_norm": 0.8708928823471069, "learning_rate": 2e-05, "loss": 0.6393, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6636, "tokens_per_second_per_gpu": 10373.63, "total_tokens": 655321980 }, { "epoch": 0.41491622905726433, "grad_norm": 0.8921635150909424, "learning_rate": 2e-05, "loss": 0.6632, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6637, "tokens_per_second_per_gpu": 10681.8, "total_tokens": 655419657 }, { "epoch": 0.4149787446861715, "grad_norm": 0.8957867622375488, "learning_rate": 2e-05, "loss": 0.6023, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6638, "tokens_per_second_per_gpu": 10324.69, "total_tokens": 655519880 }, { "epoch": 0.41504126031507876, "grad_norm": 0.927162766456604, "learning_rate": 2e-05, "loss": 0.6795, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6639, "tokens_per_second_per_gpu": 10536.84, "total_tokens": 655620128 }, { "epoch": 0.415103775943986, "grad_norm": 0.8994000554084778, "learning_rate": 2e-05, "loss": 0.6607, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6640, "tokens_per_second_per_gpu": 10508.05, "total_tokens": 655722549 }, { "epoch": 0.4151662915728932, "grad_norm": 0.8698331713676453, "learning_rate": 2e-05, "loss": 0.6893, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6641, "tokens_per_second_per_gpu": 10925.84, "total_tokens": 655821297 }, { "epoch": 0.41522880720180044, "grad_norm": 0.8855555057525635, "learning_rate": 2e-05, "loss": 0.6465, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6642, "tokens_per_second_per_gpu": 10770.41, "total_tokens": 655922728 }, { "epoch": 0.4152913228307077, "grad_norm": 0.8893334865570068, "learning_rate": 2e-05, "loss": 0.6461, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6643, "tokens_per_second_per_gpu": 10894.99, "total_tokens": 656020601 }, { "epoch": 0.4153538384596149, "grad_norm": 0.9074162244796753, "learning_rate": 2e-05, "loss": 0.6904, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6644, "tokens_per_second_per_gpu": 10910.92, "total_tokens": 656122111 }, { "epoch": 0.4154163540885221, "grad_norm": 0.8756157755851746, "learning_rate": 2e-05, "loss": 0.6491, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6645, "tokens_per_second_per_gpu": 11118.38, "total_tokens": 656224682 }, { "epoch": 0.41547886971742937, "grad_norm": 0.920604944229126, "learning_rate": 2e-05, "loss": 0.6131, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6646, "tokens_per_second_per_gpu": 10056.57, "total_tokens": 656319004 }, { "epoch": 0.4155413853463366, "grad_norm": 0.912166953086853, "learning_rate": 2e-05, "loss": 0.661, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6647, "tokens_per_second_per_gpu": 10159.01, "total_tokens": 656418045 }, { "epoch": 0.4156039009752438, "grad_norm": 0.9604987502098083, "learning_rate": 2e-05, "loss": 0.6631, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6648, "tokens_per_second_per_gpu": 10635.51, "total_tokens": 656517526 }, { "epoch": 0.41566641660415105, "grad_norm": 0.8730772733688354, "learning_rate": 2e-05, "loss": 0.6477, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6649, "tokens_per_second_per_gpu": 10533.06, "total_tokens": 656614952 }, { "epoch": 0.4157289322330583, "grad_norm": 0.8856906890869141, "learning_rate": 2e-05, "loss": 0.6631, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6650, "tokens_per_second_per_gpu": 10712.05, "total_tokens": 656715528 }, { "epoch": 0.4157914478619655, "grad_norm": 0.9087658524513245, "learning_rate": 2e-05, "loss": 0.6568, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6651, "tokens_per_second_per_gpu": 10642.09, "total_tokens": 656814095 }, { "epoch": 0.4158539634908727, "grad_norm": 0.8762233853340149, "learning_rate": 2e-05, "loss": 0.6727, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6652, "tokens_per_second_per_gpu": 10968.83, "total_tokens": 656917832 }, { "epoch": 0.41591647911977997, "grad_norm": 0.8973501920700073, "learning_rate": 2e-05, "loss": 0.6308, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6653, "tokens_per_second_per_gpu": 13965.42, "total_tokens": 657014941 }, { "epoch": 0.41597899474868716, "grad_norm": 0.913661539554596, "learning_rate": 2e-05, "loss": 0.6615, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6654, "tokens_per_second_per_gpu": 10511.18, "total_tokens": 657112067 }, { "epoch": 0.4160415103775944, "grad_norm": 0.909164309501648, "learning_rate": 2e-05, "loss": 0.7145, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6655, "tokens_per_second_per_gpu": 10952.4, "total_tokens": 657211276 }, { "epoch": 0.41610402600650165, "grad_norm": 0.8753188252449036, "learning_rate": 2e-05, "loss": 0.6621, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6656, "tokens_per_second_per_gpu": 10310.19, "total_tokens": 657312194 }, { "epoch": 0.41616654163540884, "grad_norm": 0.8883915543556213, "learning_rate": 2e-05, "loss": 0.6281, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6657, "tokens_per_second_per_gpu": 10028.62, "total_tokens": 657412237 }, { "epoch": 0.4162290572643161, "grad_norm": 0.8715337514877319, "learning_rate": 2e-05, "loss": 0.6245, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6658, "tokens_per_second_per_gpu": 10322.5, "total_tokens": 657513350 }, { "epoch": 0.4162915728932233, "grad_norm": 0.9105421304702759, "learning_rate": 2e-05, "loss": 0.6488, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6659, "tokens_per_second_per_gpu": 9743.06, "total_tokens": 657612350 }, { "epoch": 0.4163540885221305, "grad_norm": 0.8799667954444885, "learning_rate": 2e-05, "loss": 0.6501, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6660, "tokens_per_second_per_gpu": 10404.0, "total_tokens": 657711907 }, { "epoch": 0.41641660415103776, "grad_norm": 0.897607684135437, "learning_rate": 2e-05, "loss": 0.6905, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6661, "tokens_per_second_per_gpu": 11732.68, "total_tokens": 657819894 }, { "epoch": 0.416479119779945, "grad_norm": 0.8865646719932556, "learning_rate": 2e-05, "loss": 0.6133, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6662, "tokens_per_second_per_gpu": 10554.75, "total_tokens": 657915464 }, { "epoch": 0.4165416354088522, "grad_norm": 0.9194815754890442, "learning_rate": 2e-05, "loss": 0.6885, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6663, "tokens_per_second_per_gpu": 9803.64, "total_tokens": 658011820 }, { "epoch": 0.41660415103775944, "grad_norm": 0.8581889867782593, "learning_rate": 2e-05, "loss": 0.6324, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6664, "tokens_per_second_per_gpu": 10540.1, "total_tokens": 658109175 }, { "epoch": 0.4166666666666667, "grad_norm": 0.8679718971252441, "learning_rate": 2e-05, "loss": 0.6643, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6665, "tokens_per_second_per_gpu": 11217.88, "total_tokens": 658214476 }, { "epoch": 0.4167291822955739, "grad_norm": 0.8932499885559082, "learning_rate": 2e-05, "loss": 0.6785, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6666, "tokens_per_second_per_gpu": 11357.54, "total_tokens": 658317537 }, { "epoch": 0.4167916979244811, "grad_norm": 0.9800259470939636, "learning_rate": 2e-05, "loss": 0.6431, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6667, "tokens_per_second_per_gpu": 10041.27, "total_tokens": 658413392 }, { "epoch": 0.41685421355338836, "grad_norm": 0.902164101600647, "learning_rate": 2e-05, "loss": 0.6541, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6668, "tokens_per_second_per_gpu": 10502.53, "total_tokens": 658510974 }, { "epoch": 0.41691672918229555, "grad_norm": 0.8961362242698669, "learning_rate": 2e-05, "loss": 0.617, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6669, "tokens_per_second_per_gpu": 10566.71, "total_tokens": 658605124 }, { "epoch": 0.4169792448112028, "grad_norm": 0.9082238078117371, "learning_rate": 2e-05, "loss": 0.6709, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6670, "tokens_per_second_per_gpu": 9890.24, "total_tokens": 658705269 }, { "epoch": 0.41704176044011004, "grad_norm": 0.9634976983070374, "learning_rate": 2e-05, "loss": 0.6505, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6671, "tokens_per_second_per_gpu": 10995.95, "total_tokens": 658807321 }, { "epoch": 0.41710427606901723, "grad_norm": 0.8991824984550476, "learning_rate": 2e-05, "loss": 0.6088, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6672, "tokens_per_second_per_gpu": 9859.37, "total_tokens": 658902835 }, { "epoch": 0.4171667916979245, "grad_norm": 0.8682112693786621, "learning_rate": 2e-05, "loss": 0.6361, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6673, "tokens_per_second_per_gpu": 10434.24, "total_tokens": 659001643 }, { "epoch": 0.4172293073268317, "grad_norm": 0.897205650806427, "learning_rate": 2e-05, "loss": 0.6337, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6674, "tokens_per_second_per_gpu": 10684.37, "total_tokens": 659099889 }, { "epoch": 0.4172918229557389, "grad_norm": 0.8904445767402649, "learning_rate": 2e-05, "loss": 0.6513, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6675, "tokens_per_second_per_gpu": 10625.79, "total_tokens": 659201984 }, { "epoch": 0.41735433858464616, "grad_norm": 0.9189885258674622, "learning_rate": 2e-05, "loss": 0.6754, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6676, "tokens_per_second_per_gpu": 10862.28, "total_tokens": 659307263 }, { "epoch": 0.4174168542135534, "grad_norm": 0.9505597949028015, "learning_rate": 2e-05, "loss": 0.6591, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6677, "tokens_per_second_per_gpu": 9940.81, "total_tokens": 659406149 }, { "epoch": 0.4174793698424606, "grad_norm": 0.8743548393249512, "learning_rate": 2e-05, "loss": 0.6378, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6678, "tokens_per_second_per_gpu": 10782.37, "total_tokens": 659505278 }, { "epoch": 0.41754188547136784, "grad_norm": 0.9142152070999146, "learning_rate": 2e-05, "loss": 0.6527, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6679, "tokens_per_second_per_gpu": 10933.29, "total_tokens": 659606117 }, { "epoch": 0.4176044011002751, "grad_norm": 0.9903262257575989, "learning_rate": 2e-05, "loss": 0.6961, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6680, "tokens_per_second_per_gpu": 9959.18, "total_tokens": 659697140 }, { "epoch": 0.41766691672918227, "grad_norm": 0.9921224117279053, "learning_rate": 2e-05, "loss": 0.6428, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6681, "tokens_per_second_per_gpu": 10497.21, "total_tokens": 659794781 }, { "epoch": 0.4177294323580895, "grad_norm": 0.8670904636383057, "learning_rate": 2e-05, "loss": 0.6567, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6682, "tokens_per_second_per_gpu": 10757.36, "total_tokens": 659895727 }, { "epoch": 0.41779194798699676, "grad_norm": 0.8906407356262207, "learning_rate": 2e-05, "loss": 0.6976, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6683, "tokens_per_second_per_gpu": 11038.34, "total_tokens": 659998857 }, { "epoch": 0.41785446361590395, "grad_norm": 0.8874369263648987, "learning_rate": 2e-05, "loss": 0.6411, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6684, "tokens_per_second_per_gpu": 10072.01, "total_tokens": 660096474 }, { "epoch": 0.4179169792448112, "grad_norm": 0.905954897403717, "learning_rate": 2e-05, "loss": 0.674, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6685, "tokens_per_second_per_gpu": 10668.02, "total_tokens": 660197691 }, { "epoch": 0.41797949487371844, "grad_norm": 0.920296847820282, "learning_rate": 2e-05, "loss": 0.6943, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6686, "tokens_per_second_per_gpu": 9728.46, "total_tokens": 660294108 }, { "epoch": 0.4180420105026257, "grad_norm": 0.9169933795928955, "learning_rate": 2e-05, "loss": 0.6385, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6687, "tokens_per_second_per_gpu": 10326.64, "total_tokens": 660391217 }, { "epoch": 0.41810452613153287, "grad_norm": 0.8488515019416809, "learning_rate": 2e-05, "loss": 0.5932, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6688, "tokens_per_second_per_gpu": 10069.61, "total_tokens": 660487966 }, { "epoch": 0.4181670417604401, "grad_norm": 0.9266446828842163, "learning_rate": 2e-05, "loss": 0.6708, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6689, "tokens_per_second_per_gpu": 10896.82, "total_tokens": 660588600 }, { "epoch": 0.41822955738934736, "grad_norm": 0.8936483860015869, "learning_rate": 2e-05, "loss": 0.7005, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6690, "tokens_per_second_per_gpu": 11297.57, "total_tokens": 660686847 }, { "epoch": 0.41829207301825455, "grad_norm": 0.9107401967048645, "learning_rate": 2e-05, "loss": 0.683, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6691, "tokens_per_second_per_gpu": 10181.93, "total_tokens": 660785209 }, { "epoch": 0.4183545886471618, "grad_norm": 0.8665122389793396, "learning_rate": 2e-05, "loss": 0.6283, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6692, "tokens_per_second_per_gpu": 10846.98, "total_tokens": 660885616 }, { "epoch": 0.41841710427606904, "grad_norm": 0.850010335445404, "learning_rate": 2e-05, "loss": 0.671, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6693, "tokens_per_second_per_gpu": 10770.88, "total_tokens": 660987547 }, { "epoch": 0.41847961990497623, "grad_norm": 0.8854891657829285, "learning_rate": 2e-05, "loss": 0.6628, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6694, "tokens_per_second_per_gpu": 9718.71, "total_tokens": 661083842 }, { "epoch": 0.4185421355338835, "grad_norm": 0.8815518021583557, "learning_rate": 2e-05, "loss": 0.6125, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6695, "tokens_per_second_per_gpu": 9756.39, "total_tokens": 661179267 }, { "epoch": 0.4186046511627907, "grad_norm": 0.8928638100624084, "learning_rate": 2e-05, "loss": 0.6578, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6696, "tokens_per_second_per_gpu": 9696.0, "total_tokens": 661271318 }, { "epoch": 0.4186671667916979, "grad_norm": 0.9206463694572449, "learning_rate": 2e-05, "loss": 0.6443, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6697, "tokens_per_second_per_gpu": 11414.62, "total_tokens": 661373113 }, { "epoch": 0.41872968242060515, "grad_norm": 0.9179531931877136, "learning_rate": 2e-05, "loss": 0.6532, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6698, "tokens_per_second_per_gpu": 10836.02, "total_tokens": 661475332 }, { "epoch": 0.4187921980495124, "grad_norm": 0.9020354151725769, "learning_rate": 2e-05, "loss": 0.6476, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6699, "tokens_per_second_per_gpu": 10663.32, "total_tokens": 661571103 }, { "epoch": 0.4188547136784196, "grad_norm": 0.952709436416626, "learning_rate": 2e-05, "loss": 0.6492, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6700, "tokens_per_second_per_gpu": 9988.05, "total_tokens": 661664195 }, { "epoch": 0.41891722930732683, "grad_norm": 0.8979077339172363, "learning_rate": 2e-05, "loss": 0.6774, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6701, "tokens_per_second_per_gpu": 10758.04, "total_tokens": 661767072 }, { "epoch": 0.4189797449362341, "grad_norm": 0.8542967438697815, "learning_rate": 2e-05, "loss": 0.6601, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6702, "tokens_per_second_per_gpu": 10369.16, "total_tokens": 661868807 }, { "epoch": 0.41904226056514127, "grad_norm": 0.910167932510376, "learning_rate": 2e-05, "loss": 0.6379, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6703, "tokens_per_second_per_gpu": 10119.77, "total_tokens": 661966679 }, { "epoch": 0.4191047761940485, "grad_norm": 0.9289218783378601, "learning_rate": 2e-05, "loss": 0.6293, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6704, "tokens_per_second_per_gpu": 10009.86, "total_tokens": 662061660 }, { "epoch": 0.41916729182295576, "grad_norm": 0.8976542353630066, "learning_rate": 2e-05, "loss": 0.6752, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6705, "tokens_per_second_per_gpu": 11496.51, "total_tokens": 662162216 }, { "epoch": 0.41922980745186295, "grad_norm": 0.9448289275169373, "learning_rate": 2e-05, "loss": 0.6464, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6706, "tokens_per_second_per_gpu": 10728.71, "total_tokens": 662255531 }, { "epoch": 0.4192923230807702, "grad_norm": 0.9096235036849976, "learning_rate": 2e-05, "loss": 0.6694, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6707, "tokens_per_second_per_gpu": 10891.14, "total_tokens": 662349986 }, { "epoch": 0.41935483870967744, "grad_norm": 0.911954939365387, "learning_rate": 2e-05, "loss": 0.6484, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6708, "tokens_per_second_per_gpu": 10507.79, "total_tokens": 662448751 }, { "epoch": 0.4194173543385846, "grad_norm": 0.9235584735870361, "learning_rate": 2e-05, "loss": 0.6573, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6709, "tokens_per_second_per_gpu": 11061.26, "total_tokens": 662551213 }, { "epoch": 0.41947986996749187, "grad_norm": 0.8857496976852417, "learning_rate": 2e-05, "loss": 0.6545, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6710, "tokens_per_second_per_gpu": 10931.86, "total_tokens": 662652161 }, { "epoch": 0.4195423855963991, "grad_norm": 0.8497483730316162, "learning_rate": 2e-05, "loss": 0.6343, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6711, "tokens_per_second_per_gpu": 10262.11, "total_tokens": 662751512 }, { "epoch": 0.4196049012253063, "grad_norm": 0.8792641758918762, "learning_rate": 2e-05, "loss": 0.6389, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6712, "tokens_per_second_per_gpu": 10320.74, "total_tokens": 662848600 }, { "epoch": 0.41966741685421355, "grad_norm": 0.8763948678970337, "learning_rate": 2e-05, "loss": 0.6709, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6713, "tokens_per_second_per_gpu": 10805.92, "total_tokens": 662950901 }, { "epoch": 0.4197299324831208, "grad_norm": 0.9068217873573303, "learning_rate": 2e-05, "loss": 0.6665, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6714, "tokens_per_second_per_gpu": 11188.39, "total_tokens": 663053841 }, { "epoch": 0.419792448112028, "grad_norm": 0.936647355556488, "learning_rate": 2e-05, "loss": 0.6334, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6715, "tokens_per_second_per_gpu": 9688.28, "total_tokens": 663153093 }, { "epoch": 0.4198549637409352, "grad_norm": 0.8615439534187317, "learning_rate": 2e-05, "loss": 0.6395, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6716, "tokens_per_second_per_gpu": 9916.53, "total_tokens": 663250655 }, { "epoch": 0.4199174793698425, "grad_norm": 0.8566123247146606, "learning_rate": 2e-05, "loss": 0.6484, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6717, "tokens_per_second_per_gpu": 10675.06, "total_tokens": 663352049 }, { "epoch": 0.41997999499874966, "grad_norm": 0.9454087018966675, "learning_rate": 2e-05, "loss": 0.6678, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6718, "tokens_per_second_per_gpu": 10419.03, "total_tokens": 663454592 }, { "epoch": 0.4200425106276569, "grad_norm": 0.8920483589172363, "learning_rate": 2e-05, "loss": 0.6528, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6719, "tokens_per_second_per_gpu": 11230.75, "total_tokens": 663552862 }, { "epoch": 0.42010502625656415, "grad_norm": 0.9488606452941895, "learning_rate": 2e-05, "loss": 0.6185, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6720, "tokens_per_second_per_gpu": 10458.62, "total_tokens": 663650987 }, { "epoch": 0.42016754188547134, "grad_norm": 0.8690090179443359, "learning_rate": 2e-05, "loss": 0.6455, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6721, "tokens_per_second_per_gpu": 11067.58, "total_tokens": 663747749 }, { "epoch": 0.4202300575143786, "grad_norm": 0.8770521283149719, "learning_rate": 2e-05, "loss": 0.652, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6722, "tokens_per_second_per_gpu": 11050.9, "total_tokens": 663847811 }, { "epoch": 0.42029257314328583, "grad_norm": 0.9326553344726562, "learning_rate": 2e-05, "loss": 0.6651, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6723, "tokens_per_second_per_gpu": 10977.82, "total_tokens": 663945252 }, { "epoch": 0.4203550887721931, "grad_norm": 0.9343999624252319, "learning_rate": 2e-05, "loss": 0.6705, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6724, "tokens_per_second_per_gpu": 10707.34, "total_tokens": 664045596 }, { "epoch": 0.42041760440110026, "grad_norm": 0.9107325077056885, "learning_rate": 2e-05, "loss": 0.6915, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6725, "tokens_per_second_per_gpu": 11249.73, "total_tokens": 664146216 }, { "epoch": 0.4204801200300075, "grad_norm": 0.9201815724372864, "learning_rate": 2e-05, "loss": 0.6853, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6726, "tokens_per_second_per_gpu": 10934.55, "total_tokens": 664250594 }, { "epoch": 0.42054263565891475, "grad_norm": 0.8771354556083679, "learning_rate": 2e-05, "loss": 0.6724, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6727, "tokens_per_second_per_gpu": 10475.38, "total_tokens": 664347782 }, { "epoch": 0.42060515128782194, "grad_norm": 0.9098698496818542, "learning_rate": 2e-05, "loss": 0.6834, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6728, "tokens_per_second_per_gpu": 11039.66, "total_tokens": 664445285 }, { "epoch": 0.4206676669167292, "grad_norm": 0.9145180583000183, "learning_rate": 2e-05, "loss": 0.6678, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6729, "tokens_per_second_per_gpu": 10388.06, "total_tokens": 664541455 }, { "epoch": 0.42073018254563643, "grad_norm": 0.9126261472702026, "learning_rate": 2e-05, "loss": 0.6379, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6730, "tokens_per_second_per_gpu": 10710.86, "total_tokens": 664641674 }, { "epoch": 0.4207926981745436, "grad_norm": 0.918818473815918, "learning_rate": 2e-05, "loss": 0.6858, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6731, "tokens_per_second_per_gpu": 10188.71, "total_tokens": 664739871 }, { "epoch": 0.42085521380345087, "grad_norm": 0.9243000745773315, "learning_rate": 2e-05, "loss": 0.6945, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6732, "tokens_per_second_per_gpu": 11068.93, "total_tokens": 664843351 }, { "epoch": 0.4209177294323581, "grad_norm": 0.9056512117385864, "learning_rate": 2e-05, "loss": 0.7022, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6733, "tokens_per_second_per_gpu": 10306.36, "total_tokens": 664945813 }, { "epoch": 0.4209802450612653, "grad_norm": 0.9303063154220581, "learning_rate": 2e-05, "loss": 0.6858, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6734, "tokens_per_second_per_gpu": 10215.87, "total_tokens": 665045671 }, { "epoch": 0.42104276069017255, "grad_norm": 0.9257787466049194, "learning_rate": 2e-05, "loss": 0.6775, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6735, "tokens_per_second_per_gpu": 10958.36, "total_tokens": 665144585 }, { "epoch": 0.4211052763190798, "grad_norm": 0.9146287441253662, "learning_rate": 2e-05, "loss": 0.6295, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6736, "tokens_per_second_per_gpu": 10764.34, "total_tokens": 665243501 }, { "epoch": 0.421167791947987, "grad_norm": 0.8971964716911316, "learning_rate": 2e-05, "loss": 0.6341, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6737, "tokens_per_second_per_gpu": 10261.69, "total_tokens": 665343829 }, { "epoch": 0.4212303075768942, "grad_norm": 0.9002178311347961, "learning_rate": 2e-05, "loss": 0.6053, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6738, "tokens_per_second_per_gpu": 10815.18, "total_tokens": 665439729 }, { "epoch": 0.42129282320580147, "grad_norm": 0.9177205562591553, "learning_rate": 2e-05, "loss": 0.6306, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6739, "tokens_per_second_per_gpu": 10093.05, "total_tokens": 665540820 }, { "epoch": 0.42135533883470866, "grad_norm": 0.9072012305259705, "learning_rate": 2e-05, "loss": 0.6441, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6740, "tokens_per_second_per_gpu": 10315.45, "total_tokens": 665642123 }, { "epoch": 0.4214178544636159, "grad_norm": 0.9232308268547058, "learning_rate": 2e-05, "loss": 0.6718, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6741, "tokens_per_second_per_gpu": 11017.48, "total_tokens": 665742602 }, { "epoch": 0.42148037009252315, "grad_norm": 0.8761455416679382, "learning_rate": 2e-05, "loss": 0.6816, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6742, "tokens_per_second_per_gpu": 11146.13, "total_tokens": 665844899 }, { "epoch": 0.42154288572143034, "grad_norm": 0.8989574909210205, "learning_rate": 2e-05, "loss": 0.6911, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6743, "tokens_per_second_per_gpu": 10916.22, "total_tokens": 665944125 }, { "epoch": 0.4216054013503376, "grad_norm": 0.9113240838050842, "learning_rate": 2e-05, "loss": 0.6566, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6744, "tokens_per_second_per_gpu": 10140.31, "total_tokens": 666040001 }, { "epoch": 0.42166791697924483, "grad_norm": 0.9199201464653015, "learning_rate": 2e-05, "loss": 0.6607, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6745, "tokens_per_second_per_gpu": 10561.04, "total_tokens": 666138667 }, { "epoch": 0.421730432608152, "grad_norm": 0.8848922848701477, "learning_rate": 2e-05, "loss": 0.6748, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6746, "tokens_per_second_per_gpu": 10863.03, "total_tokens": 666240836 }, { "epoch": 0.42179294823705926, "grad_norm": 0.8944647312164307, "learning_rate": 2e-05, "loss": 0.6342, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6747, "tokens_per_second_per_gpu": 10086.03, "total_tokens": 666334932 }, { "epoch": 0.4218554638659665, "grad_norm": 0.8690907955169678, "learning_rate": 2e-05, "loss": 0.6331, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6748, "tokens_per_second_per_gpu": 10746.84, "total_tokens": 666433236 }, { "epoch": 0.4219179794948737, "grad_norm": 0.9119105935096741, "learning_rate": 2e-05, "loss": 0.6441, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6749, "tokens_per_second_per_gpu": 10713.39, "total_tokens": 666533128 }, { "epoch": 0.42198049512378094, "grad_norm": 0.9373078942298889, "learning_rate": 2e-05, "loss": 0.6503, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6750, "tokens_per_second_per_gpu": 10432.39, "total_tokens": 666630350 }, { "epoch": 0.4220430107526882, "grad_norm": 0.9115379452705383, "learning_rate": 2e-05, "loss": 0.6494, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6751, "tokens_per_second_per_gpu": 10433.67, "total_tokens": 666727626 }, { "epoch": 0.4221055263815954, "grad_norm": 0.8840534090995789, "learning_rate": 2e-05, "loss": 0.5883, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6752, "tokens_per_second_per_gpu": 9788.19, "total_tokens": 666821115 }, { "epoch": 0.4221680420105026, "grad_norm": 0.8795719742774963, "learning_rate": 2e-05, "loss": 0.6856, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6753, "tokens_per_second_per_gpu": 11171.95, "total_tokens": 666922823 }, { "epoch": 0.42223055763940986, "grad_norm": 0.9217066168785095, "learning_rate": 2e-05, "loss": 0.64, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6754, "tokens_per_second_per_gpu": 10460.03, "total_tokens": 667021207 }, { "epoch": 0.42229307326831705, "grad_norm": 0.9966482520103455, "learning_rate": 2e-05, "loss": 0.6526, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6755, "tokens_per_second_per_gpu": 10480.15, "total_tokens": 667115658 }, { "epoch": 0.4223555888972243, "grad_norm": 0.9104553461074829, "learning_rate": 2e-05, "loss": 0.6278, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6756, "tokens_per_second_per_gpu": 10144.76, "total_tokens": 667211766 }, { "epoch": 0.42241810452613154, "grad_norm": 0.86795973777771, "learning_rate": 2e-05, "loss": 0.624, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6757, "tokens_per_second_per_gpu": 9934.46, "total_tokens": 667310051 }, { "epoch": 0.42248062015503873, "grad_norm": 0.9277563095092773, "learning_rate": 2e-05, "loss": 0.6563, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6758, "tokens_per_second_per_gpu": 9979.15, "total_tokens": 667407475 }, { "epoch": 0.422543135783946, "grad_norm": 0.8994843363761902, "learning_rate": 2e-05, "loss": 0.6671, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6759, "tokens_per_second_per_gpu": 10898.83, "total_tokens": 667507910 }, { "epoch": 0.4226056514128532, "grad_norm": 0.9180812239646912, "learning_rate": 2e-05, "loss": 0.6941, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6760, "tokens_per_second_per_gpu": 10420.82, "total_tokens": 667608465 }, { "epoch": 0.42266816704176047, "grad_norm": 0.9611603617668152, "learning_rate": 2e-05, "loss": 0.6414, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6761, "tokens_per_second_per_gpu": 10166.92, "total_tokens": 667704774 }, { "epoch": 0.42273068267066766, "grad_norm": 0.9321263432502747, "learning_rate": 2e-05, "loss": 0.6697, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6762, "tokens_per_second_per_gpu": 10210.61, "total_tokens": 667804788 }, { "epoch": 0.4227931982995749, "grad_norm": 0.8738084435462952, "learning_rate": 2e-05, "loss": 0.6203, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6763, "tokens_per_second_per_gpu": 10118.69, "total_tokens": 667903162 }, { "epoch": 0.42285571392848215, "grad_norm": 0.9379053115844727, "learning_rate": 2e-05, "loss": 0.698, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6764, "tokens_per_second_per_gpu": 10013.87, "total_tokens": 668001500 }, { "epoch": 0.42291822955738934, "grad_norm": 0.9293761253356934, "learning_rate": 2e-05, "loss": 0.7102, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6765, "tokens_per_second_per_gpu": 11014.95, "total_tokens": 668103954 }, { "epoch": 0.4229807451862966, "grad_norm": 0.9128056168556213, "learning_rate": 2e-05, "loss": 0.6703, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6766, "tokens_per_second_per_gpu": 10820.31, "total_tokens": 668203712 }, { "epoch": 0.4230432608152038, "grad_norm": 0.960635244846344, "learning_rate": 2e-05, "loss": 0.6295, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6767, "tokens_per_second_per_gpu": 10339.13, "total_tokens": 668300761 }, { "epoch": 0.423105776444111, "grad_norm": 0.8825883269309998, "learning_rate": 2e-05, "loss": 0.634, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6768, "tokens_per_second_per_gpu": 11050.5, "total_tokens": 668403382 }, { "epoch": 0.42316829207301826, "grad_norm": 0.8655561208724976, "learning_rate": 2e-05, "loss": 0.6819, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6769, "tokens_per_second_per_gpu": 11527.26, "total_tokens": 668507737 }, { "epoch": 0.4232308077019255, "grad_norm": 0.8814163208007812, "learning_rate": 2e-05, "loss": 0.6904, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6770, "tokens_per_second_per_gpu": 10635.24, "total_tokens": 668613475 }, { "epoch": 0.4232933233308327, "grad_norm": 0.9061641693115234, "learning_rate": 2e-05, "loss": 0.6267, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6771, "tokens_per_second_per_gpu": 10066.34, "total_tokens": 668711786 }, { "epoch": 0.42335583895973994, "grad_norm": 0.9313451051712036, "learning_rate": 2e-05, "loss": 0.655, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6772, "tokens_per_second_per_gpu": 9830.18, "total_tokens": 668806834 }, { "epoch": 0.4234183545886472, "grad_norm": 0.8645527362823486, "learning_rate": 2e-05, "loss": 0.66, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6773, "tokens_per_second_per_gpu": 10749.7, "total_tokens": 668906416 }, { "epoch": 0.4234808702175544, "grad_norm": 0.8856606483459473, "learning_rate": 2e-05, "loss": 0.6058, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6774, "tokens_per_second_per_gpu": 10190.7, "total_tokens": 669003141 }, { "epoch": 0.4235433858464616, "grad_norm": 0.9185560345649719, "learning_rate": 2e-05, "loss": 0.694, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6775, "tokens_per_second_per_gpu": 10293.83, "total_tokens": 669098229 }, { "epoch": 0.42360590147536886, "grad_norm": 0.8752070069313049, "learning_rate": 2e-05, "loss": 0.6316, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6776, "tokens_per_second_per_gpu": 9795.96, "total_tokens": 669196589 }, { "epoch": 0.42366841710427605, "grad_norm": 0.8896348476409912, "learning_rate": 2e-05, "loss": 0.6412, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6777, "tokens_per_second_per_gpu": 9954.82, "total_tokens": 669293295 }, { "epoch": 0.4237309327331833, "grad_norm": 0.8505048155784607, "learning_rate": 2e-05, "loss": 0.6213, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6778, "tokens_per_second_per_gpu": 10167.7, "total_tokens": 669391838 }, { "epoch": 0.42379344836209054, "grad_norm": 0.9152548909187317, "learning_rate": 2e-05, "loss": 0.6686, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6779, "tokens_per_second_per_gpu": 10958.97, "total_tokens": 669490140 }, { "epoch": 0.42385596399099773, "grad_norm": 0.8314611911773682, "learning_rate": 2e-05, "loss": 0.5938, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6780, "tokens_per_second_per_gpu": 10746.66, "total_tokens": 669589591 }, { "epoch": 0.423918479619905, "grad_norm": 0.8405218720436096, "learning_rate": 2e-05, "loss": 0.6235, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6781, "tokens_per_second_per_gpu": 10394.31, "total_tokens": 669688201 }, { "epoch": 0.4239809952488122, "grad_norm": 0.9067569375038147, "learning_rate": 2e-05, "loss": 0.6619, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6782, "tokens_per_second_per_gpu": 10307.04, "total_tokens": 669785194 }, { "epoch": 0.4240435108777194, "grad_norm": 0.9013831615447998, "learning_rate": 2e-05, "loss": 0.6003, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6783, "tokens_per_second_per_gpu": 9690.81, "total_tokens": 669877638 }, { "epoch": 0.42410602650662665, "grad_norm": 0.8499979376792908, "learning_rate": 2e-05, "loss": 0.6599, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6784, "tokens_per_second_per_gpu": 11496.38, "total_tokens": 669981600 }, { "epoch": 0.4241685421355339, "grad_norm": 0.9033688902854919, "learning_rate": 2e-05, "loss": 0.6816, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6785, "tokens_per_second_per_gpu": 10847.49, "total_tokens": 670078447 }, { "epoch": 0.4242310577644411, "grad_norm": 0.8473495244979858, "learning_rate": 2e-05, "loss": 0.6475, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6786, "tokens_per_second_per_gpu": 10878.89, "total_tokens": 670179741 }, { "epoch": 0.42429357339334833, "grad_norm": 0.9744210243225098, "learning_rate": 2e-05, "loss": 0.6509, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6787, "tokens_per_second_per_gpu": 10250.26, "total_tokens": 670279931 }, { "epoch": 0.4243560890222556, "grad_norm": 0.9337438344955444, "learning_rate": 2e-05, "loss": 0.6676, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6788, "tokens_per_second_per_gpu": 10231.02, "total_tokens": 670375936 }, { "epoch": 0.42441860465116277, "grad_norm": 0.9056283235549927, "learning_rate": 2e-05, "loss": 0.6454, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6789, "tokens_per_second_per_gpu": 11419.3, "total_tokens": 670478210 }, { "epoch": 0.42448112028007, "grad_norm": 0.9017273783683777, "learning_rate": 2e-05, "loss": 0.6497, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6790, "tokens_per_second_per_gpu": 10739.49, "total_tokens": 670575484 }, { "epoch": 0.42454363590897726, "grad_norm": 0.9267946481704712, "learning_rate": 2e-05, "loss": 0.6626, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6791, "tokens_per_second_per_gpu": 9619.28, "total_tokens": 670672955 }, { "epoch": 0.42460615153788445, "grad_norm": 0.8766735196113586, "learning_rate": 2e-05, "loss": 0.6375, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6792, "tokens_per_second_per_gpu": 11152.6, "total_tokens": 670774035 }, { "epoch": 0.4246686671667917, "grad_norm": 0.8911466598510742, "learning_rate": 2e-05, "loss": 0.6331, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6793, "tokens_per_second_per_gpu": 10732.59, "total_tokens": 670877329 }, { "epoch": 0.42473118279569894, "grad_norm": 0.9075658321380615, "learning_rate": 2e-05, "loss": 0.654, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6794, "tokens_per_second_per_gpu": 10119.45, "total_tokens": 670976269 }, { "epoch": 0.4247936984246061, "grad_norm": 0.9041377902030945, "learning_rate": 2e-05, "loss": 0.68, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6795, "tokens_per_second_per_gpu": 10299.3, "total_tokens": 671076861 }, { "epoch": 0.42485621405351337, "grad_norm": 0.9019074440002441, "learning_rate": 2e-05, "loss": 0.6788, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6796, "tokens_per_second_per_gpu": 10699.11, "total_tokens": 671177700 }, { "epoch": 0.4249187296824206, "grad_norm": 0.9435154795646667, "learning_rate": 2e-05, "loss": 0.6456, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6797, "tokens_per_second_per_gpu": 10569.58, "total_tokens": 671274824 }, { "epoch": 0.4249812453113278, "grad_norm": 1.1160579919815063, "learning_rate": 2e-05, "loss": 0.6641, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6798, "tokens_per_second_per_gpu": 10787.79, "total_tokens": 671374598 }, { "epoch": 0.42504376094023505, "grad_norm": 0.8760924935340881, "learning_rate": 2e-05, "loss": 0.6515, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6799, "tokens_per_second_per_gpu": 9990.15, "total_tokens": 671472395 }, { "epoch": 0.4251062765691423, "grad_norm": 0.9005284905433655, "learning_rate": 2e-05, "loss": 0.632, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6800, "tokens_per_second_per_gpu": 10209.77, "total_tokens": 671571019 }, { "epoch": 0.42516879219804954, "grad_norm": 0.9399028420448303, "learning_rate": 2e-05, "loss": 0.6738, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6801, "tokens_per_second_per_gpu": 9859.88, "total_tokens": 671666992 }, { "epoch": 0.42523130782695673, "grad_norm": 0.887530505657196, "learning_rate": 2e-05, "loss": 0.6429, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6802, "tokens_per_second_per_gpu": 10698.21, "total_tokens": 671769216 }, { "epoch": 0.425293823455864, "grad_norm": 0.8809298872947693, "learning_rate": 2e-05, "loss": 0.6633, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6803, "tokens_per_second_per_gpu": 10057.22, "total_tokens": 671868765 }, { "epoch": 0.4253563390847712, "grad_norm": 0.881600022315979, "learning_rate": 2e-05, "loss": 0.6654, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6804, "tokens_per_second_per_gpu": 10632.55, "total_tokens": 671972848 }, { "epoch": 0.4254188547136784, "grad_norm": 0.8646917939186096, "learning_rate": 2e-05, "loss": 0.6268, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6805, "tokens_per_second_per_gpu": 10619.86, "total_tokens": 672068055 }, { "epoch": 0.42548137034258565, "grad_norm": 0.9051732420921326, "learning_rate": 2e-05, "loss": 0.7015, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6806, "tokens_per_second_per_gpu": 10805.5, "total_tokens": 672169380 }, { "epoch": 0.4255438859714929, "grad_norm": 0.8877246379852295, "learning_rate": 2e-05, "loss": 0.6503, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6807, "tokens_per_second_per_gpu": 11454.84, "total_tokens": 672268723 }, { "epoch": 0.4256064016004001, "grad_norm": 0.9055740237236023, "learning_rate": 2e-05, "loss": 0.6579, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6808, "tokens_per_second_per_gpu": 10438.3, "total_tokens": 672369290 }, { "epoch": 0.42566891722930733, "grad_norm": 0.9014304876327515, "learning_rate": 2e-05, "loss": 0.5911, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6809, "tokens_per_second_per_gpu": 9525.35, "total_tokens": 672460463 }, { "epoch": 0.4257314328582146, "grad_norm": 0.9063059091567993, "learning_rate": 2e-05, "loss": 0.6571, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6810, "tokens_per_second_per_gpu": 10300.53, "total_tokens": 672561982 }, { "epoch": 0.42579394848712177, "grad_norm": 0.8814898729324341, "learning_rate": 2e-05, "loss": 0.6876, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6811, "tokens_per_second_per_gpu": 10383.14, "total_tokens": 672664238 }, { "epoch": 0.425856464116029, "grad_norm": 0.8980963230133057, "learning_rate": 2e-05, "loss": 0.59, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6812, "tokens_per_second_per_gpu": 9831.03, "total_tokens": 672757331 }, { "epoch": 0.42591897974493625, "grad_norm": 0.878997802734375, "learning_rate": 2e-05, "loss": 0.6562, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6813, "tokens_per_second_per_gpu": 11309.7, "total_tokens": 672861918 }, { "epoch": 0.42598149537384344, "grad_norm": 0.9032399654388428, "learning_rate": 2e-05, "loss": 0.6872, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6814, "tokens_per_second_per_gpu": 10658.21, "total_tokens": 672965531 }, { "epoch": 0.4260440110027507, "grad_norm": 0.9164764881134033, "learning_rate": 2e-05, "loss": 0.6523, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6815, "tokens_per_second_per_gpu": 10801.9, "total_tokens": 673066164 }, { "epoch": 0.42610652663165793, "grad_norm": 0.8863667249679565, "learning_rate": 2e-05, "loss": 0.6107, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6816, "tokens_per_second_per_gpu": 10027.67, "total_tokens": 673163893 }, { "epoch": 0.4261690422605651, "grad_norm": 0.8640396595001221, "learning_rate": 2e-05, "loss": 0.6454, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6817, "tokens_per_second_per_gpu": 10612.31, "total_tokens": 673261054 }, { "epoch": 0.42623155788947237, "grad_norm": 0.878024697303772, "learning_rate": 2e-05, "loss": 0.6792, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6818, "tokens_per_second_per_gpu": 11207.98, "total_tokens": 673367154 }, { "epoch": 0.4262940735183796, "grad_norm": 0.9045286178588867, "learning_rate": 2e-05, "loss": 0.6701, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6819, "tokens_per_second_per_gpu": 10422.71, "total_tokens": 673467740 }, { "epoch": 0.4263565891472868, "grad_norm": 0.8998208045959473, "learning_rate": 2e-05, "loss": 0.644, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6820, "tokens_per_second_per_gpu": 10163.24, "total_tokens": 673567652 }, { "epoch": 0.42641910477619405, "grad_norm": 0.9915116429328918, "learning_rate": 2e-05, "loss": 0.6751, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6821, "tokens_per_second_per_gpu": 11236.28, "total_tokens": 673670703 }, { "epoch": 0.4264816204051013, "grad_norm": 0.8780509233474731, "learning_rate": 2e-05, "loss": 0.6412, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6822, "tokens_per_second_per_gpu": 9830.67, "total_tokens": 673768615 }, { "epoch": 0.4265441360340085, "grad_norm": 0.8838528394699097, "learning_rate": 2e-05, "loss": 0.6448, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6823, "tokens_per_second_per_gpu": 10595.6, "total_tokens": 673867044 }, { "epoch": 0.4266066516629157, "grad_norm": 0.9543107748031616, "learning_rate": 2e-05, "loss": 0.6927, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6824, "tokens_per_second_per_gpu": 10244.53, "total_tokens": 673965029 }, { "epoch": 0.42666916729182297, "grad_norm": 1.2280473709106445, "learning_rate": 2e-05, "loss": 0.6715, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6825, "tokens_per_second_per_gpu": 9904.59, "total_tokens": 674063041 }, { "epoch": 0.42673168292073016, "grad_norm": 0.938144862651825, "learning_rate": 2e-05, "loss": 0.6444, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6826, "tokens_per_second_per_gpu": 10547.18, "total_tokens": 674162938 }, { "epoch": 0.4267941985496374, "grad_norm": 0.8753791451454163, "learning_rate": 2e-05, "loss": 0.615, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6827, "tokens_per_second_per_gpu": 10598.04, "total_tokens": 674259647 }, { "epoch": 0.42685671417854465, "grad_norm": 0.9270891547203064, "learning_rate": 2e-05, "loss": 0.643, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6828, "tokens_per_second_per_gpu": 10085.73, "total_tokens": 674357605 }, { "epoch": 0.42691922980745184, "grad_norm": 0.9456541538238525, "learning_rate": 2e-05, "loss": 0.6091, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6829, "tokens_per_second_per_gpu": 10182.51, "total_tokens": 674453554 }, { "epoch": 0.4269817454363591, "grad_norm": 0.9450266361236572, "learning_rate": 2e-05, "loss": 0.682, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6830, "tokens_per_second_per_gpu": 11243.22, "total_tokens": 674555316 }, { "epoch": 0.42704426106526633, "grad_norm": 0.8965449929237366, "learning_rate": 2e-05, "loss": 0.6557, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6831, "tokens_per_second_per_gpu": 9975.81, "total_tokens": 674652928 }, { "epoch": 0.4271067766941735, "grad_norm": 0.909710168838501, "learning_rate": 2e-05, "loss": 0.651, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6832, "tokens_per_second_per_gpu": 10267.43, "total_tokens": 674746405 }, { "epoch": 0.42716929232308076, "grad_norm": 0.9017312526702881, "learning_rate": 2e-05, "loss": 0.6648, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6833, "tokens_per_second_per_gpu": 10981.47, "total_tokens": 674845432 }, { "epoch": 0.427231807951988, "grad_norm": 0.9223718047142029, "learning_rate": 2e-05, "loss": 0.6523, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6834, "tokens_per_second_per_gpu": 11232.07, "total_tokens": 674945831 }, { "epoch": 0.4272943235808952, "grad_norm": 0.9000111222267151, "learning_rate": 2e-05, "loss": 0.6321, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6835, "tokens_per_second_per_gpu": 9594.57, "total_tokens": 675044404 }, { "epoch": 0.42735683920980244, "grad_norm": 0.9131481051445007, "learning_rate": 2e-05, "loss": 0.6629, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6836, "tokens_per_second_per_gpu": 10054.2, "total_tokens": 675143226 }, { "epoch": 0.4274193548387097, "grad_norm": 0.8761589527130127, "learning_rate": 2e-05, "loss": 0.6356, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6837, "tokens_per_second_per_gpu": 10255.03, "total_tokens": 675240556 }, { "epoch": 0.42748187046761693, "grad_norm": 0.9163235425949097, "learning_rate": 2e-05, "loss": 0.6349, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6838, "tokens_per_second_per_gpu": 10549.5, "total_tokens": 675338581 }, { "epoch": 0.4275443860965241, "grad_norm": 0.919103741645813, "learning_rate": 2e-05, "loss": 0.6424, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6839, "tokens_per_second_per_gpu": 10824.78, "total_tokens": 675441532 }, { "epoch": 0.42760690172543137, "grad_norm": 0.8898981213569641, "learning_rate": 2e-05, "loss": 0.6328, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6840, "tokens_per_second_per_gpu": 11064.26, "total_tokens": 675541696 }, { "epoch": 0.4276694173543386, "grad_norm": 0.92947918176651, "learning_rate": 2e-05, "loss": 0.6334, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6841, "tokens_per_second_per_gpu": 10302.86, "total_tokens": 675638629 }, { "epoch": 0.4277319329832458, "grad_norm": 0.8763963580131531, "learning_rate": 2e-05, "loss": 0.6748, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6842, "tokens_per_second_per_gpu": 10751.45, "total_tokens": 675738152 }, { "epoch": 0.42779444861215304, "grad_norm": 0.8973132371902466, "learning_rate": 2e-05, "loss": 0.6712, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6843, "tokens_per_second_per_gpu": 10867.67, "total_tokens": 675842467 }, { "epoch": 0.4278569642410603, "grad_norm": 0.9190518856048584, "learning_rate": 2e-05, "loss": 0.6532, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6844, "tokens_per_second_per_gpu": 9835.4, "total_tokens": 675937433 }, { "epoch": 0.4279194798699675, "grad_norm": 0.881558358669281, "learning_rate": 2e-05, "loss": 0.6389, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6845, "tokens_per_second_per_gpu": 10237.72, "total_tokens": 676039364 }, { "epoch": 0.4279819954988747, "grad_norm": 0.8672465682029724, "learning_rate": 2e-05, "loss": 0.6289, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6846, "tokens_per_second_per_gpu": 10762.99, "total_tokens": 676142397 }, { "epoch": 0.42804451112778197, "grad_norm": 0.8858804702758789, "learning_rate": 2e-05, "loss": 0.6624, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6847, "tokens_per_second_per_gpu": 11241.19, "total_tokens": 676244768 }, { "epoch": 0.42810702675668916, "grad_norm": 0.9031122326850891, "learning_rate": 2e-05, "loss": 0.6366, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6848, "tokens_per_second_per_gpu": 10284.44, "total_tokens": 676339498 }, { "epoch": 0.4281695423855964, "grad_norm": 0.9312429428100586, "learning_rate": 2e-05, "loss": 0.6606, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6849, "tokens_per_second_per_gpu": 10782.95, "total_tokens": 676440028 }, { "epoch": 0.42823205801450365, "grad_norm": 0.8684501051902771, "learning_rate": 2e-05, "loss": 0.6047, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6850, "tokens_per_second_per_gpu": 10230.87, "total_tokens": 676540219 }, { "epoch": 0.42829457364341084, "grad_norm": 0.8983113765716553, "learning_rate": 2e-05, "loss": 0.662, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6851, "tokens_per_second_per_gpu": 9988.76, "total_tokens": 676636911 }, { "epoch": 0.4283570892723181, "grad_norm": 0.8800249099731445, "learning_rate": 2e-05, "loss": 0.6275, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6852, "tokens_per_second_per_gpu": 10238.16, "total_tokens": 676736591 }, { "epoch": 0.4284196049012253, "grad_norm": 0.9000704288482666, "learning_rate": 2e-05, "loss": 0.6198, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6853, "tokens_per_second_per_gpu": 9439.21, "total_tokens": 676830327 }, { "epoch": 0.4284821205301325, "grad_norm": 0.8955562114715576, "learning_rate": 2e-05, "loss": 0.6415, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6854, "tokens_per_second_per_gpu": 10853.89, "total_tokens": 676930310 }, { "epoch": 0.42854463615903976, "grad_norm": 0.9212886691093445, "learning_rate": 2e-05, "loss": 0.6206, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6855, "tokens_per_second_per_gpu": 10449.98, "total_tokens": 677026293 }, { "epoch": 0.428607151787947, "grad_norm": 0.8773530721664429, "learning_rate": 2e-05, "loss": 0.6601, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6856, "tokens_per_second_per_gpu": 11042.56, "total_tokens": 677127027 }, { "epoch": 0.4286696674168542, "grad_norm": 0.893596887588501, "learning_rate": 2e-05, "loss": 0.6855, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6857, "tokens_per_second_per_gpu": 10653.6, "total_tokens": 677227347 }, { "epoch": 0.42873218304576144, "grad_norm": 0.895227313041687, "learning_rate": 2e-05, "loss": 0.6289, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6858, "tokens_per_second_per_gpu": 9986.76, "total_tokens": 677322662 }, { "epoch": 0.4287946986746687, "grad_norm": 0.8610924482345581, "learning_rate": 2e-05, "loss": 0.6545, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6859, "tokens_per_second_per_gpu": 10590.98, "total_tokens": 677422522 }, { "epoch": 0.4288572143035759, "grad_norm": 0.8810421228408813, "learning_rate": 2e-05, "loss": 0.649, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6860, "tokens_per_second_per_gpu": 10775.48, "total_tokens": 677525618 }, { "epoch": 0.4289197299324831, "grad_norm": 0.901414155960083, "learning_rate": 2e-05, "loss": 0.6897, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6861, "tokens_per_second_per_gpu": 11709.65, "total_tokens": 677626450 }, { "epoch": 0.42898224556139036, "grad_norm": 0.8831868767738342, "learning_rate": 2e-05, "loss": 0.6239, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6862, "tokens_per_second_per_gpu": 10749.48, "total_tokens": 677724693 }, { "epoch": 0.42904476119029755, "grad_norm": 0.9040478467941284, "learning_rate": 2e-05, "loss": 0.6187, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6863, "tokens_per_second_per_gpu": 10575.84, "total_tokens": 677829691 }, { "epoch": 0.4291072768192048, "grad_norm": 0.8576968908309937, "learning_rate": 2e-05, "loss": 0.6371, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6864, "tokens_per_second_per_gpu": 10492.83, "total_tokens": 677930636 }, { "epoch": 0.42916979244811204, "grad_norm": 0.8637682199478149, "learning_rate": 2e-05, "loss": 0.6375, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6865, "tokens_per_second_per_gpu": 11167.14, "total_tokens": 678031820 }, { "epoch": 0.42923230807701923, "grad_norm": 0.8764199018478394, "learning_rate": 2e-05, "loss": 0.5931, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6866, "tokens_per_second_per_gpu": 10626.4, "total_tokens": 678127883 }, { "epoch": 0.4292948237059265, "grad_norm": 0.8719032406806946, "learning_rate": 2e-05, "loss": 0.6543, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6867, "tokens_per_second_per_gpu": 10652.9, "total_tokens": 678228124 }, { "epoch": 0.4293573393348337, "grad_norm": 0.8854702115058899, "learning_rate": 2e-05, "loss": 0.6948, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6868, "tokens_per_second_per_gpu": 11044.58, "total_tokens": 678330528 }, { "epoch": 0.4294198549637409, "grad_norm": 0.8530554175376892, "learning_rate": 2e-05, "loss": 0.6567, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6869, "tokens_per_second_per_gpu": 11434.89, "total_tokens": 678436118 }, { "epoch": 0.42948237059264815, "grad_norm": 0.9035859704017639, "learning_rate": 2e-05, "loss": 0.683, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6870, "tokens_per_second_per_gpu": 10507.05, "total_tokens": 678534454 }, { "epoch": 0.4295448862215554, "grad_norm": 0.9294997453689575, "learning_rate": 2e-05, "loss": 0.7075, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6871, "tokens_per_second_per_gpu": 10534.25, "total_tokens": 678636066 }, { "epoch": 0.4296074018504626, "grad_norm": 0.8582906723022461, "learning_rate": 2e-05, "loss": 0.6275, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6872, "tokens_per_second_per_gpu": 10031.09, "total_tokens": 678734218 }, { "epoch": 0.42966991747936983, "grad_norm": 0.849541962146759, "learning_rate": 2e-05, "loss": 0.6221, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6873, "tokens_per_second_per_gpu": 10682.06, "total_tokens": 678835923 }, { "epoch": 0.4297324331082771, "grad_norm": 0.903638482093811, "learning_rate": 2e-05, "loss": 0.6238, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6874, "tokens_per_second_per_gpu": 9519.24, "total_tokens": 678929709 }, { "epoch": 0.4297949487371843, "grad_norm": 0.8789188861846924, "learning_rate": 2e-05, "loss": 0.6641, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6875, "tokens_per_second_per_gpu": 11480.95, "total_tokens": 679030630 }, { "epoch": 0.4298574643660915, "grad_norm": 0.8858581185340881, "learning_rate": 2e-05, "loss": 0.6271, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6876, "tokens_per_second_per_gpu": 9793.65, "total_tokens": 679128389 }, { "epoch": 0.42991997999499876, "grad_norm": 0.8767330646514893, "learning_rate": 2e-05, "loss": 0.6639, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6877, "tokens_per_second_per_gpu": 10540.03, "total_tokens": 679227601 }, { "epoch": 0.429982495623906, "grad_norm": 0.8972876071929932, "learning_rate": 2e-05, "loss": 0.6417, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6878, "tokens_per_second_per_gpu": 10115.37, "total_tokens": 679324289 }, { "epoch": 0.4300450112528132, "grad_norm": 0.8915911912918091, "learning_rate": 2e-05, "loss": 0.6723, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6879, "tokens_per_second_per_gpu": 10507.05, "total_tokens": 679424262 }, { "epoch": 0.43010752688172044, "grad_norm": 0.8779021501541138, "learning_rate": 2e-05, "loss": 0.6732, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6880, "tokens_per_second_per_gpu": 10579.73, "total_tokens": 679526485 }, { "epoch": 0.4301700425106277, "grad_norm": 0.8968715667724609, "learning_rate": 2e-05, "loss": 0.7045, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6881, "tokens_per_second_per_gpu": 10668.61, "total_tokens": 679626030 }, { "epoch": 0.43023255813953487, "grad_norm": 0.9084022641181946, "learning_rate": 2e-05, "loss": 0.6318, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6882, "tokens_per_second_per_gpu": 10895.32, "total_tokens": 679725626 }, { "epoch": 0.4302950737684421, "grad_norm": 0.8890919089317322, "learning_rate": 2e-05, "loss": 0.6211, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6883, "tokens_per_second_per_gpu": 10269.99, "total_tokens": 679820281 }, { "epoch": 0.43035758939734936, "grad_norm": 0.8835004568099976, "learning_rate": 2e-05, "loss": 0.6218, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6884, "tokens_per_second_per_gpu": 10957.35, "total_tokens": 679919965 }, { "epoch": 0.43042010502625655, "grad_norm": 0.9011564254760742, "learning_rate": 2e-05, "loss": 0.6498, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6885, "tokens_per_second_per_gpu": 10520.28, "total_tokens": 680018605 }, { "epoch": 0.4304826206551638, "grad_norm": 0.9089033007621765, "learning_rate": 2e-05, "loss": 0.6627, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6886, "tokens_per_second_per_gpu": 10564.79, "total_tokens": 680115937 }, { "epoch": 0.43054513628407104, "grad_norm": 0.9004768133163452, "learning_rate": 2e-05, "loss": 0.6325, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6887, "tokens_per_second_per_gpu": 10141.42, "total_tokens": 680212041 }, { "epoch": 0.43060765191297823, "grad_norm": 0.849800169467926, "learning_rate": 2e-05, "loss": 0.6425, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6888, "tokens_per_second_per_gpu": 9947.24, "total_tokens": 680311538 }, { "epoch": 0.4306701675418855, "grad_norm": 0.8974554538726807, "learning_rate": 2e-05, "loss": 0.6505, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6889, "tokens_per_second_per_gpu": 10216.6, "total_tokens": 680407852 }, { "epoch": 0.4307326831707927, "grad_norm": 0.8933639526367188, "learning_rate": 2e-05, "loss": 0.6462, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6890, "tokens_per_second_per_gpu": 15652.66, "total_tokens": 680507775 }, { "epoch": 0.4307951987996999, "grad_norm": 0.9007205367088318, "learning_rate": 2e-05, "loss": 0.6695, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6891, "tokens_per_second_per_gpu": 11159.58, "total_tokens": 680609264 }, { "epoch": 0.43085771442860715, "grad_norm": 0.9018850326538086, "learning_rate": 2e-05, "loss": 0.6839, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6892, "tokens_per_second_per_gpu": 10754.37, "total_tokens": 680709426 }, { "epoch": 0.4309202300575144, "grad_norm": 0.9056071639060974, "learning_rate": 2e-05, "loss": 0.6466, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6893, "tokens_per_second_per_gpu": 11107.6, "total_tokens": 680813674 }, { "epoch": 0.4309827456864216, "grad_norm": 0.9416744112968445, "learning_rate": 2e-05, "loss": 0.7079, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6894, "tokens_per_second_per_gpu": 11003.43, "total_tokens": 680915587 }, { "epoch": 0.43104526131532883, "grad_norm": 0.9024548530578613, "learning_rate": 2e-05, "loss": 0.6592, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6895, "tokens_per_second_per_gpu": 10578.47, "total_tokens": 681016158 }, { "epoch": 0.4311077769442361, "grad_norm": 0.8856962323188782, "learning_rate": 2e-05, "loss": 0.7029, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6896, "tokens_per_second_per_gpu": 11223.32, "total_tokens": 681120210 }, { "epoch": 0.43117029257314327, "grad_norm": 0.8630344867706299, "learning_rate": 2e-05, "loss": 0.6378, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6897, "tokens_per_second_per_gpu": 10695.08, "total_tokens": 681221901 }, { "epoch": 0.4312328082020505, "grad_norm": 0.8727108836174011, "learning_rate": 2e-05, "loss": 0.6576, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6898, "tokens_per_second_per_gpu": 11178.04, "total_tokens": 681323328 }, { "epoch": 0.43129532383095776, "grad_norm": 0.9272902011871338, "learning_rate": 2e-05, "loss": 0.6616, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6899, "tokens_per_second_per_gpu": 10478.01, "total_tokens": 681424523 }, { "epoch": 0.43135783945986494, "grad_norm": 0.8965616226196289, "learning_rate": 2e-05, "loss": 0.6282, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6900, "tokens_per_second_per_gpu": 10565.49, "total_tokens": 681522887 }, { "epoch": 0.4314203550887722, "grad_norm": 0.8893905282020569, "learning_rate": 2e-05, "loss": 0.6264, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6901, "tokens_per_second_per_gpu": 10308.02, "total_tokens": 681621206 }, { "epoch": 0.43148287071767943, "grad_norm": 0.8977298140525818, "learning_rate": 2e-05, "loss": 0.6411, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6902, "tokens_per_second_per_gpu": 10122.34, "total_tokens": 681714905 }, { "epoch": 0.4315453863465866, "grad_norm": 0.8892102837562561, "learning_rate": 2e-05, "loss": 0.6741, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6903, "tokens_per_second_per_gpu": 10638.45, "total_tokens": 681813711 }, { "epoch": 0.43160790197549387, "grad_norm": 0.8685736656188965, "learning_rate": 2e-05, "loss": 0.6617, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6904, "tokens_per_second_per_gpu": 10434.05, "total_tokens": 681912750 }, { "epoch": 0.4316704176044011, "grad_norm": 0.9347820281982422, "learning_rate": 2e-05, "loss": 0.6853, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6905, "tokens_per_second_per_gpu": 9945.12, "total_tokens": 682008937 }, { "epoch": 0.4317329332333083, "grad_norm": 0.8777009844779968, "learning_rate": 2e-05, "loss": 0.6579, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6906, "tokens_per_second_per_gpu": 10182.71, "total_tokens": 682107824 }, { "epoch": 0.43179544886221555, "grad_norm": 0.8819359540939331, "learning_rate": 2e-05, "loss": 0.6322, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6907, "tokens_per_second_per_gpu": 9673.84, "total_tokens": 682204684 }, { "epoch": 0.4318579644911228, "grad_norm": 0.9121957421302795, "learning_rate": 2e-05, "loss": 0.6606, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6908, "tokens_per_second_per_gpu": 10820.46, "total_tokens": 682306216 }, { "epoch": 0.43192048012003, "grad_norm": 0.878040611743927, "learning_rate": 2e-05, "loss": 0.6224, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6909, "tokens_per_second_per_gpu": 10378.44, "total_tokens": 682402581 }, { "epoch": 0.4319829957489372, "grad_norm": 0.859501302242279, "learning_rate": 2e-05, "loss": 0.638, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6910, "tokens_per_second_per_gpu": 10281.12, "total_tokens": 682502629 }, { "epoch": 0.43204551137784447, "grad_norm": 0.9096227288246155, "learning_rate": 2e-05, "loss": 0.6343, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6911, "tokens_per_second_per_gpu": 9381.67, "total_tokens": 682597834 }, { "epoch": 0.43210802700675166, "grad_norm": 0.8913900256156921, "learning_rate": 2e-05, "loss": 0.664, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6912, "tokens_per_second_per_gpu": 10751.3, "total_tokens": 682701053 }, { "epoch": 0.4321705426356589, "grad_norm": 0.8981696963310242, "learning_rate": 2e-05, "loss": 0.619, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6913, "tokens_per_second_per_gpu": 10197.97, "total_tokens": 682796108 }, { "epoch": 0.43223305826456615, "grad_norm": 0.8992924690246582, "learning_rate": 2e-05, "loss": 0.6494, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6914, "tokens_per_second_per_gpu": 11044.76, "total_tokens": 682891747 }, { "epoch": 0.4322955738934734, "grad_norm": 0.879564106464386, "learning_rate": 2e-05, "loss": 0.6398, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6915, "tokens_per_second_per_gpu": 9427.43, "total_tokens": 682989397 }, { "epoch": 0.4323580895223806, "grad_norm": 0.9042783379554749, "learning_rate": 2e-05, "loss": 0.6714, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6916, "tokens_per_second_per_gpu": 10073.05, "total_tokens": 683085734 }, { "epoch": 0.43242060515128783, "grad_norm": 0.9018633365631104, "learning_rate": 2e-05, "loss": 0.6435, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6917, "tokens_per_second_per_gpu": 10431.92, "total_tokens": 683180333 }, { "epoch": 0.4324831207801951, "grad_norm": 0.8864091634750366, "learning_rate": 2e-05, "loss": 0.6218, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6918, "tokens_per_second_per_gpu": 10297.91, "total_tokens": 683275288 }, { "epoch": 0.43254563640910226, "grad_norm": 0.8790479302406311, "learning_rate": 2e-05, "loss": 0.6249, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6919, "tokens_per_second_per_gpu": 10613.25, "total_tokens": 683374390 }, { "epoch": 0.4326081520380095, "grad_norm": 0.9126395583152771, "learning_rate": 2e-05, "loss": 0.6637, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6920, "tokens_per_second_per_gpu": 10373.77, "total_tokens": 683475061 }, { "epoch": 0.43267066766691675, "grad_norm": 0.956858217716217, "learning_rate": 2e-05, "loss": 0.6317, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6921, "tokens_per_second_per_gpu": 9676.58, "total_tokens": 683569823 }, { "epoch": 0.43273318329582394, "grad_norm": 0.9095152616500854, "learning_rate": 2e-05, "loss": 0.6385, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6922, "tokens_per_second_per_gpu": 10655.23, "total_tokens": 683669122 }, { "epoch": 0.4327956989247312, "grad_norm": 0.9037536382675171, "learning_rate": 2e-05, "loss": 0.6173, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6923, "tokens_per_second_per_gpu": 9928.41, "total_tokens": 683765979 }, { "epoch": 0.43285821455363843, "grad_norm": 0.8739933371543884, "learning_rate": 2e-05, "loss": 0.6451, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6924, "tokens_per_second_per_gpu": 10557.21, "total_tokens": 683865675 }, { "epoch": 0.4329207301825456, "grad_norm": 0.9299914240837097, "learning_rate": 2e-05, "loss": 0.6526, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6925, "tokens_per_second_per_gpu": 10568.26, "total_tokens": 683962387 }, { "epoch": 0.43298324581145287, "grad_norm": 0.8874809741973877, "learning_rate": 2e-05, "loss": 0.6376, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6926, "tokens_per_second_per_gpu": 9962.06, "total_tokens": 684059243 }, { "epoch": 0.4330457614403601, "grad_norm": 0.9112151861190796, "learning_rate": 2e-05, "loss": 0.6853, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6927, "tokens_per_second_per_gpu": 10180.51, "total_tokens": 684159064 }, { "epoch": 0.4331082770692673, "grad_norm": 0.8975244164466858, "learning_rate": 2e-05, "loss": 0.6695, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6928, "tokens_per_second_per_gpu": 11135.92, "total_tokens": 684257808 }, { "epoch": 0.43317079269817454, "grad_norm": 0.8828606605529785, "learning_rate": 2e-05, "loss": 0.6226, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6929, "tokens_per_second_per_gpu": 10409.35, "total_tokens": 684352661 }, { "epoch": 0.4332333083270818, "grad_norm": 0.9226715564727783, "learning_rate": 2e-05, "loss": 0.6587, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6930, "tokens_per_second_per_gpu": 10749.14, "total_tokens": 684453187 }, { "epoch": 0.433295823955989, "grad_norm": 0.8747329711914062, "learning_rate": 2e-05, "loss": 0.6079, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6931, "tokens_per_second_per_gpu": 10369.38, "total_tokens": 684551109 }, { "epoch": 0.4333583395848962, "grad_norm": 0.8614431619644165, "learning_rate": 2e-05, "loss": 0.5997, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6932, "tokens_per_second_per_gpu": 10001.67, "total_tokens": 684648588 }, { "epoch": 0.43342085521380347, "grad_norm": 0.8832631707191467, "learning_rate": 2e-05, "loss": 0.6805, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6933, "tokens_per_second_per_gpu": 10748.5, "total_tokens": 684748656 }, { "epoch": 0.43348337084271066, "grad_norm": 0.8850008845329285, "learning_rate": 2e-05, "loss": 0.6408, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6934, "tokens_per_second_per_gpu": 10352.17, "total_tokens": 684848384 }, { "epoch": 0.4335458864716179, "grad_norm": 0.8914068937301636, "learning_rate": 2e-05, "loss": 0.6644, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6935, "tokens_per_second_per_gpu": 10405.99, "total_tokens": 684948198 }, { "epoch": 0.43360840210052515, "grad_norm": 0.8929901123046875, "learning_rate": 2e-05, "loss": 0.6817, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6936, "tokens_per_second_per_gpu": 11001.41, "total_tokens": 685046706 }, { "epoch": 0.43367091772943234, "grad_norm": 0.8774480819702148, "learning_rate": 2e-05, "loss": 0.6642, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6937, "tokens_per_second_per_gpu": 11057.2, "total_tokens": 685146942 }, { "epoch": 0.4337334333583396, "grad_norm": 0.9036903977394104, "learning_rate": 2e-05, "loss": 0.6466, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6938, "tokens_per_second_per_gpu": 11167.44, "total_tokens": 685245177 }, { "epoch": 0.4337959489872468, "grad_norm": 0.8866709470748901, "learning_rate": 2e-05, "loss": 0.6071, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6939, "tokens_per_second_per_gpu": 9811.84, "total_tokens": 685341896 }, { "epoch": 0.433858464616154, "grad_norm": 0.8959676027297974, "learning_rate": 2e-05, "loss": 0.6635, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6940, "tokens_per_second_per_gpu": 11181.93, "total_tokens": 685445573 }, { "epoch": 0.43392098024506126, "grad_norm": 0.8682245016098022, "learning_rate": 2e-05, "loss": 0.5853, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6941, "tokens_per_second_per_gpu": 10066.32, "total_tokens": 685540893 }, { "epoch": 0.4339834958739685, "grad_norm": 0.8873187899589539, "learning_rate": 2e-05, "loss": 0.6227, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6942, "tokens_per_second_per_gpu": 10413.91, "total_tokens": 685641244 }, { "epoch": 0.4340460115028757, "grad_norm": 0.8923372030258179, "learning_rate": 2e-05, "loss": 0.6717, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6943, "tokens_per_second_per_gpu": 10221.42, "total_tokens": 685739751 }, { "epoch": 0.43410852713178294, "grad_norm": 0.8945716023445129, "learning_rate": 2e-05, "loss": 0.6368, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6944, "tokens_per_second_per_gpu": 10142.52, "total_tokens": 685839733 }, { "epoch": 0.4341710427606902, "grad_norm": 0.893252968788147, "learning_rate": 2e-05, "loss": 0.6504, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6945, "tokens_per_second_per_gpu": 10260.95, "total_tokens": 685941262 }, { "epoch": 0.4342335583895974, "grad_norm": 0.9152304530143738, "learning_rate": 2e-05, "loss": 0.6594, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6946, "tokens_per_second_per_gpu": 10825.52, "total_tokens": 686041619 }, { "epoch": 0.4342960740185046, "grad_norm": 0.8913084268569946, "learning_rate": 2e-05, "loss": 0.6701, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6947, "tokens_per_second_per_gpu": 10191.1, "total_tokens": 686136862 }, { "epoch": 0.43435858964741186, "grad_norm": 0.9344972968101501, "learning_rate": 2e-05, "loss": 0.7307, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6948, "tokens_per_second_per_gpu": 10264.01, "total_tokens": 686235165 }, { "epoch": 0.43442110527631905, "grad_norm": 0.8707529306411743, "learning_rate": 2e-05, "loss": 0.6282, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6949, "tokens_per_second_per_gpu": 10492.29, "total_tokens": 686334272 }, { "epoch": 0.4344836209052263, "grad_norm": 0.8973826766014099, "learning_rate": 2e-05, "loss": 0.6715, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6950, "tokens_per_second_per_gpu": 10294.38, "total_tokens": 686433271 }, { "epoch": 0.43454613653413354, "grad_norm": 0.9001959562301636, "learning_rate": 2e-05, "loss": 0.6729, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6951, "tokens_per_second_per_gpu": 9702.05, "total_tokens": 686528789 }, { "epoch": 0.4346086521630408, "grad_norm": 0.9208841919898987, "learning_rate": 2e-05, "loss": 0.6537, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6952, "tokens_per_second_per_gpu": 10764.7, "total_tokens": 686630357 }, { "epoch": 0.434671167791948, "grad_norm": 0.8801277279853821, "learning_rate": 2e-05, "loss": 0.6503, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6953, "tokens_per_second_per_gpu": 10572.82, "total_tokens": 686731888 }, { "epoch": 0.4347336834208552, "grad_norm": 0.8530187010765076, "learning_rate": 2e-05, "loss": 0.6208, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6954, "tokens_per_second_per_gpu": 11164.81, "total_tokens": 686835124 }, { "epoch": 0.43479619904976247, "grad_norm": 0.8685469627380371, "learning_rate": 2e-05, "loss": 0.6501, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6955, "tokens_per_second_per_gpu": 10235.74, "total_tokens": 686935539 }, { "epoch": 0.43485871467866966, "grad_norm": 0.8794047236442566, "learning_rate": 2e-05, "loss": 0.6297, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6956, "tokens_per_second_per_gpu": 10933.12, "total_tokens": 687037215 }, { "epoch": 0.4349212303075769, "grad_norm": 0.8713026642799377, "learning_rate": 2e-05, "loss": 0.5943, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6957, "tokens_per_second_per_gpu": 10161.4, "total_tokens": 687132773 }, { "epoch": 0.43498374593648415, "grad_norm": 0.8544425964355469, "learning_rate": 2e-05, "loss": 0.6567, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6958, "tokens_per_second_per_gpu": 11168.53, "total_tokens": 687235013 }, { "epoch": 0.43504626156539133, "grad_norm": 0.8724835515022278, "learning_rate": 2e-05, "loss": 0.6399, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6959, "tokens_per_second_per_gpu": 10926.92, "total_tokens": 687334042 }, { "epoch": 0.4351087771942986, "grad_norm": 0.8833666443824768, "learning_rate": 2e-05, "loss": 0.6184, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6960, "tokens_per_second_per_gpu": 10220.13, "total_tokens": 687431160 }, { "epoch": 0.4351712928232058, "grad_norm": 0.8518242835998535, "learning_rate": 2e-05, "loss": 0.607, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6961, "tokens_per_second_per_gpu": 11125.98, "total_tokens": 687531851 }, { "epoch": 0.435233808452113, "grad_norm": 0.9141445159912109, "learning_rate": 2e-05, "loss": 0.72, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6962, "tokens_per_second_per_gpu": 10136.51, "total_tokens": 687630672 }, { "epoch": 0.43529632408102026, "grad_norm": 0.9392234683036804, "learning_rate": 2e-05, "loss": 0.674, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6963, "tokens_per_second_per_gpu": 10216.64, "total_tokens": 687729848 }, { "epoch": 0.4353588397099275, "grad_norm": 0.9275689721107483, "learning_rate": 2e-05, "loss": 0.6212, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6964, "tokens_per_second_per_gpu": 10369.25, "total_tokens": 687823802 }, { "epoch": 0.4354213553388347, "grad_norm": 0.9020716547966003, "learning_rate": 2e-05, "loss": 0.6931, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6965, "tokens_per_second_per_gpu": 10530.61, "total_tokens": 687923797 }, { "epoch": 0.43548387096774194, "grad_norm": 0.9428019523620605, "learning_rate": 2e-05, "loss": 0.7393, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6966, "tokens_per_second_per_gpu": 10791.0, "total_tokens": 688025205 }, { "epoch": 0.4355463865966492, "grad_norm": 0.8925796151161194, "learning_rate": 2e-05, "loss": 0.6851, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6967, "tokens_per_second_per_gpu": 10677.15, "total_tokens": 688125274 }, { "epoch": 0.43560890222555637, "grad_norm": 0.8691924214363098, "learning_rate": 2e-05, "loss": 0.6456, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6968, "tokens_per_second_per_gpu": 10139.16, "total_tokens": 688224122 }, { "epoch": 0.4356714178544636, "grad_norm": 0.8788156509399414, "learning_rate": 2e-05, "loss": 0.6936, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6969, "tokens_per_second_per_gpu": 10907.71, "total_tokens": 688327400 }, { "epoch": 0.43573393348337086, "grad_norm": 0.9265073537826538, "learning_rate": 2e-05, "loss": 0.6518, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6970, "tokens_per_second_per_gpu": 10838.75, "total_tokens": 688429842 }, { "epoch": 0.43579644911227805, "grad_norm": 0.8895459771156311, "learning_rate": 2e-05, "loss": 0.6402, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6971, "tokens_per_second_per_gpu": 10429.19, "total_tokens": 688529560 }, { "epoch": 0.4358589647411853, "grad_norm": 0.9307700991630554, "learning_rate": 2e-05, "loss": 0.6462, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6972, "tokens_per_second_per_gpu": 10635.76, "total_tokens": 688630009 }, { "epoch": 0.43592148037009254, "grad_norm": 0.8947626948356628, "learning_rate": 2e-05, "loss": 0.6686, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6973, "tokens_per_second_per_gpu": 11155.14, "total_tokens": 688730843 }, { "epoch": 0.43598399599899973, "grad_norm": 0.9065396189689636, "learning_rate": 2e-05, "loss": 0.6664, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6974, "tokens_per_second_per_gpu": 10255.32, "total_tokens": 688832007 }, { "epoch": 0.436046511627907, "grad_norm": 0.9041352868080139, "learning_rate": 2e-05, "loss": 0.6498, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6975, "tokens_per_second_per_gpu": 10975.51, "total_tokens": 688929710 }, { "epoch": 0.4361090272568142, "grad_norm": 0.9023439884185791, "learning_rate": 2e-05, "loss": 0.657, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6976, "tokens_per_second_per_gpu": 10655.97, "total_tokens": 689031433 }, { "epoch": 0.4361715428857214, "grad_norm": 0.8917027115821838, "learning_rate": 2e-05, "loss": 0.6664, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6977, "tokens_per_second_per_gpu": 10291.47, "total_tokens": 689130115 }, { "epoch": 0.43623405851462865, "grad_norm": 0.881065309047699, "learning_rate": 2e-05, "loss": 0.6466, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6978, "tokens_per_second_per_gpu": 10864.34, "total_tokens": 689232425 }, { "epoch": 0.4362965741435359, "grad_norm": 0.8420377373695374, "learning_rate": 2e-05, "loss": 0.6617, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6979, "tokens_per_second_per_gpu": 10751.33, "total_tokens": 689331070 }, { "epoch": 0.4363590897724431, "grad_norm": 0.8824722170829773, "learning_rate": 2e-05, "loss": 0.6442, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6980, "tokens_per_second_per_gpu": 10476.88, "total_tokens": 689433476 }, { "epoch": 0.43642160540135033, "grad_norm": 0.8703944683074951, "learning_rate": 2e-05, "loss": 0.6788, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6981, "tokens_per_second_per_gpu": 11230.59, "total_tokens": 689538038 }, { "epoch": 0.4364841210302576, "grad_norm": 0.9238203763961792, "learning_rate": 2e-05, "loss": 0.6183, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6982, "tokens_per_second_per_gpu": 10336.83, "total_tokens": 689632498 }, { "epoch": 0.43654663665916477, "grad_norm": 0.9214573502540588, "learning_rate": 2e-05, "loss": 0.6866, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6983, "tokens_per_second_per_gpu": 10333.47, "total_tokens": 689730746 }, { "epoch": 0.436609152288072, "grad_norm": 0.8902478814125061, "learning_rate": 2e-05, "loss": 0.6301, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6984, "tokens_per_second_per_gpu": 10056.35, "total_tokens": 689831986 }, { "epoch": 0.43667166791697926, "grad_norm": 0.9233885407447815, "learning_rate": 2e-05, "loss": 0.6195, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6985, "tokens_per_second_per_gpu": 10114.7, "total_tokens": 689931537 }, { "epoch": 0.43673418354588645, "grad_norm": 0.9039666652679443, "learning_rate": 2e-05, "loss": 0.6883, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6986, "tokens_per_second_per_gpu": 11150.52, "total_tokens": 690033585 }, { "epoch": 0.4367966991747937, "grad_norm": 0.9167495369911194, "learning_rate": 2e-05, "loss": 0.6511, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6987, "tokens_per_second_per_gpu": 10308.9, "total_tokens": 690130917 }, { "epoch": 0.43685921480370093, "grad_norm": 0.8796223998069763, "learning_rate": 2e-05, "loss": 0.6247, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6988, "tokens_per_second_per_gpu": 10933.55, "total_tokens": 690233234 }, { "epoch": 0.4369217304326082, "grad_norm": 0.8728172183036804, "learning_rate": 2e-05, "loss": 0.6808, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6989, "tokens_per_second_per_gpu": 10943.93, "total_tokens": 690335677 }, { "epoch": 0.43698424606151537, "grad_norm": 0.8714851140975952, "learning_rate": 2e-05, "loss": 0.6602, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6990, "tokens_per_second_per_gpu": 11143.02, "total_tokens": 690438241 }, { "epoch": 0.4370467616904226, "grad_norm": 0.9086534380912781, "learning_rate": 2e-05, "loss": 0.6357, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6991, "tokens_per_second_per_gpu": 9544.92, "total_tokens": 690534088 }, { "epoch": 0.43710927731932986, "grad_norm": 0.8663797378540039, "learning_rate": 2e-05, "loss": 0.6412, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6992, "tokens_per_second_per_gpu": 10655.17, "total_tokens": 690629765 }, { "epoch": 0.43717179294823705, "grad_norm": 0.8716017007827759, "learning_rate": 2e-05, "loss": 0.6101, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6993, "tokens_per_second_per_gpu": 10462.92, "total_tokens": 690724206 }, { "epoch": 0.4372343085771443, "grad_norm": 0.8928051590919495, "learning_rate": 2e-05, "loss": 0.6451, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6994, "tokens_per_second_per_gpu": 10229.7, "total_tokens": 690823905 }, { "epoch": 0.43729682420605154, "grad_norm": 0.8813943266868591, "learning_rate": 2e-05, "loss": 0.6322, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6995, "tokens_per_second_per_gpu": 10260.32, "total_tokens": 690923621 }, { "epoch": 0.4373593398349587, "grad_norm": 0.8842306733131409, "learning_rate": 2e-05, "loss": 0.6594, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6996, "tokens_per_second_per_gpu": 10849.58, "total_tokens": 691024704 }, { "epoch": 0.43742185546386597, "grad_norm": 0.8853808045387268, "learning_rate": 2e-05, "loss": 0.6452, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6997, "tokens_per_second_per_gpu": 10268.33, "total_tokens": 691124033 }, { "epoch": 0.4374843710927732, "grad_norm": 0.8723185658454895, "learning_rate": 2e-05, "loss": 0.6219, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6998, "tokens_per_second_per_gpu": 10181.72, "total_tokens": 691223037 }, { "epoch": 0.4375468867216804, "grad_norm": 0.8684550523757935, "learning_rate": 2e-05, "loss": 0.6406, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 6999, "tokens_per_second_per_gpu": 10540.88, "total_tokens": 691320041 }, { "epoch": 0.43760940235058765, "grad_norm": 0.8726896047592163, "learning_rate": 2e-05, "loss": 0.6218, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7000, "tokens_per_second_per_gpu": 10479.94, "total_tokens": 691419021 }, { "epoch": 0.4376719179794949, "grad_norm": 0.8957680463790894, "learning_rate": 2e-05, "loss": 0.6451, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7001, "tokens_per_second_per_gpu": 9709.49, "total_tokens": 691510983 }, { "epoch": 0.4377344336084021, "grad_norm": 0.890516996383667, "learning_rate": 2e-05, "loss": 0.631, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7002, "tokens_per_second_per_gpu": 9538.5, "total_tokens": 691603487 }, { "epoch": 0.43779694923730933, "grad_norm": 0.8758320212364197, "learning_rate": 2e-05, "loss": 0.5755, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7003, "tokens_per_second_per_gpu": 9286.4, "total_tokens": 691694701 }, { "epoch": 0.4378594648662166, "grad_norm": 0.8770759105682373, "learning_rate": 2e-05, "loss": 0.6425, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7004, "tokens_per_second_per_gpu": 10243.52, "total_tokens": 691790727 }, { "epoch": 0.43792198049512376, "grad_norm": 0.9295414686203003, "learning_rate": 2e-05, "loss": 0.6768, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7005, "tokens_per_second_per_gpu": 10709.88, "total_tokens": 691891326 }, { "epoch": 0.437984496124031, "grad_norm": 0.8842893242835999, "learning_rate": 2e-05, "loss": 0.6614, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7006, "tokens_per_second_per_gpu": 10333.39, "total_tokens": 691988207 }, { "epoch": 0.43804701175293825, "grad_norm": 0.9054988026618958, "learning_rate": 2e-05, "loss": 0.6023, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7007, "tokens_per_second_per_gpu": 10683.6, "total_tokens": 692086330 }, { "epoch": 0.43810952738184544, "grad_norm": 0.9046687483787537, "learning_rate": 2e-05, "loss": 0.6991, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7008, "tokens_per_second_per_gpu": 10062.05, "total_tokens": 692185685 }, { "epoch": 0.4381720430107527, "grad_norm": 0.8560109734535217, "learning_rate": 2e-05, "loss": 0.662, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7009, "tokens_per_second_per_gpu": 10459.68, "total_tokens": 692287684 }, { "epoch": 0.43823455863965993, "grad_norm": 0.9204574227333069, "learning_rate": 2e-05, "loss": 0.6768, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7010, "tokens_per_second_per_gpu": 10073.72, "total_tokens": 692386979 }, { "epoch": 0.4382970742685671, "grad_norm": 0.8948124647140503, "learning_rate": 2e-05, "loss": 0.666, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7011, "tokens_per_second_per_gpu": 10351.61, "total_tokens": 692482289 }, { "epoch": 0.43835958989747437, "grad_norm": 0.9037637710571289, "learning_rate": 2e-05, "loss": 0.6537, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7012, "tokens_per_second_per_gpu": 10360.68, "total_tokens": 692578698 }, { "epoch": 0.4384221055263816, "grad_norm": 0.8907397985458374, "learning_rate": 2e-05, "loss": 0.6446, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7013, "tokens_per_second_per_gpu": 10233.21, "total_tokens": 692671975 }, { "epoch": 0.4384846211552888, "grad_norm": 0.8890507221221924, "learning_rate": 2e-05, "loss": 0.6355, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7014, "tokens_per_second_per_gpu": 10475.77, "total_tokens": 692764786 }, { "epoch": 0.43854713678419605, "grad_norm": 0.8631757497787476, "learning_rate": 2e-05, "loss": 0.6305, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7015, "tokens_per_second_per_gpu": 10914.63, "total_tokens": 692864868 }, { "epoch": 0.4386096524131033, "grad_norm": 0.913858950138092, "learning_rate": 2e-05, "loss": 0.6273, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7016, "tokens_per_second_per_gpu": 9554.69, "total_tokens": 692956201 }, { "epoch": 0.4386721680420105, "grad_norm": 0.9158677458763123, "learning_rate": 2e-05, "loss": 0.6501, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7017, "tokens_per_second_per_gpu": 10139.06, "total_tokens": 693051378 }, { "epoch": 0.4387346836709177, "grad_norm": 0.9145834445953369, "learning_rate": 2e-05, "loss": 0.6587, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7018, "tokens_per_second_per_gpu": 9477.48, "total_tokens": 693148759 }, { "epoch": 0.43879719929982497, "grad_norm": 0.9029538035392761, "learning_rate": 2e-05, "loss": 0.6615, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7019, "tokens_per_second_per_gpu": 10363.01, "total_tokens": 693246233 }, { "epoch": 0.43885971492873216, "grad_norm": 0.972280740737915, "learning_rate": 2e-05, "loss": 0.6336, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7020, "tokens_per_second_per_gpu": 10691.12, "total_tokens": 693340867 }, { "epoch": 0.4389222305576394, "grad_norm": 0.9381400942802429, "learning_rate": 2e-05, "loss": 0.6223, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7021, "tokens_per_second_per_gpu": 10784.72, "total_tokens": 693434901 }, { "epoch": 0.43898474618654665, "grad_norm": 0.9040526151657104, "learning_rate": 2e-05, "loss": 0.6701, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7022, "tokens_per_second_per_gpu": 9919.27, "total_tokens": 693532792 }, { "epoch": 0.43904726181545384, "grad_norm": 0.8708069920539856, "learning_rate": 2e-05, "loss": 0.6248, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7023, "tokens_per_second_per_gpu": 10548.65, "total_tokens": 693630045 }, { "epoch": 0.4391097774443611, "grad_norm": 0.8859361410140991, "learning_rate": 2e-05, "loss": 0.718, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7024, "tokens_per_second_per_gpu": 11282.38, "total_tokens": 693732136 }, { "epoch": 0.4391722930732683, "grad_norm": 0.9769240021705627, "learning_rate": 2e-05, "loss": 0.6672, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7025, "tokens_per_second_per_gpu": 10066.87, "total_tokens": 693828515 }, { "epoch": 0.4392348087021755, "grad_norm": 0.908552885055542, "learning_rate": 2e-05, "loss": 0.6586, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7026, "tokens_per_second_per_gpu": 11193.76, "total_tokens": 693927455 }, { "epoch": 0.43929732433108276, "grad_norm": 0.8649091124534607, "learning_rate": 2e-05, "loss": 0.6565, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7027, "tokens_per_second_per_gpu": 10282.24, "total_tokens": 694026677 }, { "epoch": 0.43935983995999, "grad_norm": 0.8924229741096497, "learning_rate": 2e-05, "loss": 0.6522, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7028, "tokens_per_second_per_gpu": 10484.49, "total_tokens": 694126243 }, { "epoch": 0.43942235558889725, "grad_norm": 0.9012399315834045, "learning_rate": 2e-05, "loss": 0.6673, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7029, "tokens_per_second_per_gpu": 11106.07, "total_tokens": 694228588 }, { "epoch": 0.43948487121780444, "grad_norm": 0.9044739603996277, "learning_rate": 2e-05, "loss": 0.6551, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7030, "tokens_per_second_per_gpu": 10102.9, "total_tokens": 694325676 }, { "epoch": 0.4395473868467117, "grad_norm": 0.9126173853874207, "learning_rate": 2e-05, "loss": 0.6828, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7031, "tokens_per_second_per_gpu": 10944.57, "total_tokens": 694422889 }, { "epoch": 0.43960990247561893, "grad_norm": 0.8815016746520996, "learning_rate": 2e-05, "loss": 0.686, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7032, "tokens_per_second_per_gpu": 11081.38, "total_tokens": 694523365 }, { "epoch": 0.4396724181045261, "grad_norm": 0.9345031976699829, "learning_rate": 2e-05, "loss": 0.6976, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7033, "tokens_per_second_per_gpu": 10773.74, "total_tokens": 694623894 }, { "epoch": 0.43973493373343336, "grad_norm": 0.9398226141929626, "learning_rate": 2e-05, "loss": 0.6991, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7034, "tokens_per_second_per_gpu": 11295.88, "total_tokens": 694728472 }, { "epoch": 0.4397974493623406, "grad_norm": 0.8639299869537354, "learning_rate": 2e-05, "loss": 0.6697, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7035, "tokens_per_second_per_gpu": 10769.32, "total_tokens": 694828263 }, { "epoch": 0.4398599649912478, "grad_norm": 0.8725041747093201, "learning_rate": 2e-05, "loss": 0.6412, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7036, "tokens_per_second_per_gpu": 10653.44, "total_tokens": 694928100 }, { "epoch": 0.43992248062015504, "grad_norm": 0.8999530076980591, "learning_rate": 2e-05, "loss": 0.6382, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7037, "tokens_per_second_per_gpu": 10838.84, "total_tokens": 695028091 }, { "epoch": 0.4399849962490623, "grad_norm": 0.8897904753684998, "learning_rate": 2e-05, "loss": 0.6553, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7038, "tokens_per_second_per_gpu": 9949.11, "total_tokens": 695124414 }, { "epoch": 0.4400475118779695, "grad_norm": 0.901992678642273, "learning_rate": 2e-05, "loss": 0.6802, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7039, "tokens_per_second_per_gpu": 10900.39, "total_tokens": 695227257 }, { "epoch": 0.4401100275068767, "grad_norm": 0.8659164905548096, "learning_rate": 2e-05, "loss": 0.6392, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7040, "tokens_per_second_per_gpu": 11135.59, "total_tokens": 695325333 }, { "epoch": 0.44017254313578397, "grad_norm": 0.8851075768470764, "learning_rate": 2e-05, "loss": 0.6494, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7041, "tokens_per_second_per_gpu": 10568.54, "total_tokens": 695424902 }, { "epoch": 0.44023505876469116, "grad_norm": 0.9317925572395325, "learning_rate": 2e-05, "loss": 0.6625, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7042, "tokens_per_second_per_gpu": 9972.38, "total_tokens": 695518948 }, { "epoch": 0.4402975743935984, "grad_norm": 0.9023717045783997, "learning_rate": 2e-05, "loss": 0.6764, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7043, "tokens_per_second_per_gpu": 11412.55, "total_tokens": 695617994 }, { "epoch": 0.44036009002250565, "grad_norm": 0.8783659934997559, "learning_rate": 2e-05, "loss": 0.7042, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7044, "tokens_per_second_per_gpu": 11804.04, "total_tokens": 695721901 }, { "epoch": 0.44042260565141284, "grad_norm": 0.8808799982070923, "learning_rate": 2e-05, "loss": 0.6705, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7045, "tokens_per_second_per_gpu": 11249.27, "total_tokens": 695827673 }, { "epoch": 0.4404851212803201, "grad_norm": 0.8910269141197205, "learning_rate": 2e-05, "loss": 0.6751, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7046, "tokens_per_second_per_gpu": 10138.88, "total_tokens": 695929812 }, { "epoch": 0.4405476369092273, "grad_norm": 0.8594407439231873, "learning_rate": 2e-05, "loss": 0.6825, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7047, "tokens_per_second_per_gpu": 11091.27, "total_tokens": 696035978 }, { "epoch": 0.4406101525381345, "grad_norm": 0.9046370387077332, "learning_rate": 2e-05, "loss": 0.5965, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7048, "tokens_per_second_per_gpu": 10154.49, "total_tokens": 696131516 }, { "epoch": 0.44067266816704176, "grad_norm": 0.9055845141410828, "learning_rate": 2e-05, "loss": 0.7202, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7049, "tokens_per_second_per_gpu": 10639.86, "total_tokens": 696235284 }, { "epoch": 0.440735183795949, "grad_norm": 0.8997625112533569, "learning_rate": 2e-05, "loss": 0.6779, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7050, "tokens_per_second_per_gpu": 10705.71, "total_tokens": 696336581 }, { "epoch": 0.4407976994248562, "grad_norm": 0.8834435343742371, "learning_rate": 2e-05, "loss": 0.6533, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7051, "tokens_per_second_per_gpu": 11058.73, "total_tokens": 696436194 }, { "epoch": 0.44086021505376344, "grad_norm": 0.9267857074737549, "learning_rate": 2e-05, "loss": 0.6566, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7052, "tokens_per_second_per_gpu": 10034.11, "total_tokens": 696534485 }, { "epoch": 0.4409227306826707, "grad_norm": 0.8628542423248291, "learning_rate": 2e-05, "loss": 0.6878, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7053, "tokens_per_second_per_gpu": 11039.2, "total_tokens": 696638665 }, { "epoch": 0.44098524631157787, "grad_norm": 0.9287081956863403, "learning_rate": 2e-05, "loss": 0.6976, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7054, "tokens_per_second_per_gpu": 10148.04, "total_tokens": 696736056 }, { "epoch": 0.4410477619404851, "grad_norm": 0.8905535340309143, "learning_rate": 2e-05, "loss": 0.6355, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7055, "tokens_per_second_per_gpu": 10448.49, "total_tokens": 696832115 }, { "epoch": 0.44111027756939236, "grad_norm": 0.8617660999298096, "learning_rate": 2e-05, "loss": 0.6285, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7056, "tokens_per_second_per_gpu": 10596.27, "total_tokens": 696931044 }, { "epoch": 0.44117279319829955, "grad_norm": 0.9249648451805115, "learning_rate": 2e-05, "loss": 0.6812, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7057, "tokens_per_second_per_gpu": 10810.42, "total_tokens": 697030634 }, { "epoch": 0.4412353088272068, "grad_norm": 0.920680046081543, "learning_rate": 2e-05, "loss": 0.6411, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7058, "tokens_per_second_per_gpu": 10592.49, "total_tokens": 697127368 }, { "epoch": 0.44129782445611404, "grad_norm": 0.9086510539054871, "learning_rate": 2e-05, "loss": 0.6732, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7059, "tokens_per_second_per_gpu": 10450.07, "total_tokens": 697228182 }, { "epoch": 0.44136034008502123, "grad_norm": 0.8910081386566162, "learning_rate": 2e-05, "loss": 0.6097, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7060, "tokens_per_second_per_gpu": 10113.25, "total_tokens": 697324871 }, { "epoch": 0.4414228557139285, "grad_norm": 0.8885026574134827, "learning_rate": 2e-05, "loss": 0.5985, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7061, "tokens_per_second_per_gpu": 9379.65, "total_tokens": 697416283 }, { "epoch": 0.4414853713428357, "grad_norm": 0.8887032866477966, "learning_rate": 2e-05, "loss": 0.6317, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7062, "tokens_per_second_per_gpu": 10876.83, "total_tokens": 697514455 }, { "epoch": 0.4415478869717429, "grad_norm": 0.9109359383583069, "learning_rate": 2e-05, "loss": 0.6528, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7063, "tokens_per_second_per_gpu": 11072.73, "total_tokens": 697614332 }, { "epoch": 0.44161040260065015, "grad_norm": 0.919222891330719, "learning_rate": 2e-05, "loss": 0.6395, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7064, "tokens_per_second_per_gpu": 11080.3, "total_tokens": 697712133 }, { "epoch": 0.4416729182295574, "grad_norm": 0.9329749345779419, "learning_rate": 2e-05, "loss": 0.6535, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7065, "tokens_per_second_per_gpu": 10890.23, "total_tokens": 697808613 }, { "epoch": 0.44173543385846464, "grad_norm": 0.882612943649292, "learning_rate": 2e-05, "loss": 0.6504, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7066, "tokens_per_second_per_gpu": 10698.67, "total_tokens": 697909994 }, { "epoch": 0.44179794948737183, "grad_norm": 0.9264366626739502, "learning_rate": 2e-05, "loss": 0.688, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7067, "tokens_per_second_per_gpu": 10205.81, "total_tokens": 698006005 }, { "epoch": 0.4418604651162791, "grad_norm": 0.9009432792663574, "learning_rate": 2e-05, "loss": 0.6766, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7068, "tokens_per_second_per_gpu": 10782.89, "total_tokens": 698110939 }, { "epoch": 0.4419229807451863, "grad_norm": 0.960353672504425, "learning_rate": 2e-05, "loss": 0.6618, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7069, "tokens_per_second_per_gpu": 10444.87, "total_tokens": 698207387 }, { "epoch": 0.4419854963740935, "grad_norm": 0.8865508437156677, "learning_rate": 2e-05, "loss": 0.6734, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7070, "tokens_per_second_per_gpu": 10344.14, "total_tokens": 698308697 }, { "epoch": 0.44204801200300076, "grad_norm": 0.9029401540756226, "learning_rate": 2e-05, "loss": 0.6352, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7071, "tokens_per_second_per_gpu": 10579.43, "total_tokens": 698410930 }, { "epoch": 0.442110527631908, "grad_norm": 0.9060653448104858, "learning_rate": 2e-05, "loss": 0.6458, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7072, "tokens_per_second_per_gpu": 10934.86, "total_tokens": 698511144 }, { "epoch": 0.4421730432608152, "grad_norm": 0.91575688123703, "learning_rate": 2e-05, "loss": 0.6181, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7073, "tokens_per_second_per_gpu": 9835.82, "total_tokens": 698605799 }, { "epoch": 0.44223555888972244, "grad_norm": 0.9029085040092468, "learning_rate": 2e-05, "loss": 0.6516, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7074, "tokens_per_second_per_gpu": 9705.79, "total_tokens": 698699062 }, { "epoch": 0.4422980745186297, "grad_norm": 0.932279109954834, "learning_rate": 2e-05, "loss": 0.6178, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7075, "tokens_per_second_per_gpu": 10193.27, "total_tokens": 698791308 }, { "epoch": 0.44236059014753687, "grad_norm": 0.8977958559989929, "learning_rate": 2e-05, "loss": 0.6263, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7076, "tokens_per_second_per_gpu": 11030.26, "total_tokens": 698891532 }, { "epoch": 0.4424231057764441, "grad_norm": 0.9109218120574951, "learning_rate": 2e-05, "loss": 0.6557, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7077, "tokens_per_second_per_gpu": 11452.32, "total_tokens": 698994441 }, { "epoch": 0.44248562140535136, "grad_norm": 0.9023426175117493, "learning_rate": 2e-05, "loss": 0.6015, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7078, "tokens_per_second_per_gpu": 10339.72, "total_tokens": 699092352 }, { "epoch": 0.44254813703425855, "grad_norm": 0.9067647457122803, "learning_rate": 2e-05, "loss": 0.6269, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7079, "tokens_per_second_per_gpu": 10290.42, "total_tokens": 699188119 }, { "epoch": 0.4426106526631658, "grad_norm": 0.8907734751701355, "learning_rate": 2e-05, "loss": 0.6397, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7080, "tokens_per_second_per_gpu": 10523.79, "total_tokens": 699284084 }, { "epoch": 0.44267316829207304, "grad_norm": 0.8709009885787964, "learning_rate": 2e-05, "loss": 0.6504, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7081, "tokens_per_second_per_gpu": 11089.93, "total_tokens": 699386232 }, { "epoch": 0.4427356839209802, "grad_norm": 0.907536506652832, "learning_rate": 2e-05, "loss": 0.6867, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7082, "tokens_per_second_per_gpu": 10674.58, "total_tokens": 699487278 }, { "epoch": 0.4427981995498875, "grad_norm": 0.9161652326583862, "learning_rate": 2e-05, "loss": 0.6468, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7083, "tokens_per_second_per_gpu": 9648.47, "total_tokens": 699583539 }, { "epoch": 0.4428607151787947, "grad_norm": 0.8910559415817261, "learning_rate": 2e-05, "loss": 0.6518, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7084, "tokens_per_second_per_gpu": 10881.1, "total_tokens": 699686722 }, { "epoch": 0.4429232308077019, "grad_norm": 0.8655469417572021, "learning_rate": 2e-05, "loss": 0.6309, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7085, "tokens_per_second_per_gpu": 10731.82, "total_tokens": 699788138 }, { "epoch": 0.44298574643660915, "grad_norm": 0.8809127807617188, "learning_rate": 2e-05, "loss": 0.6575, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7086, "tokens_per_second_per_gpu": 10506.54, "total_tokens": 699886826 }, { "epoch": 0.4430482620655164, "grad_norm": 0.9618892073631287, "learning_rate": 2e-05, "loss": 0.6565, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7087, "tokens_per_second_per_gpu": 10642.71, "total_tokens": 699982531 }, { "epoch": 0.4431107776944236, "grad_norm": 0.8693593144416809, "learning_rate": 2e-05, "loss": 0.5808, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7088, "tokens_per_second_per_gpu": 9654.54, "total_tokens": 700073927 }, { "epoch": 0.44317329332333083, "grad_norm": 0.9268360733985901, "learning_rate": 2e-05, "loss": 0.6965, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7089, "tokens_per_second_per_gpu": 10539.31, "total_tokens": 700173631 }, { "epoch": 0.4432358089522381, "grad_norm": 0.8713095784187317, "learning_rate": 2e-05, "loss": 0.672, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7090, "tokens_per_second_per_gpu": 10900.95, "total_tokens": 700275721 }, { "epoch": 0.44329832458114526, "grad_norm": 0.8617590069770813, "learning_rate": 2e-05, "loss": 0.6899, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7091, "tokens_per_second_per_gpu": 11037.68, "total_tokens": 700377294 }, { "epoch": 0.4433608402100525, "grad_norm": 0.920636773109436, "learning_rate": 2e-05, "loss": 0.6554, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7092, "tokens_per_second_per_gpu": 10885.89, "total_tokens": 700474830 }, { "epoch": 0.44342335583895975, "grad_norm": 0.8999167680740356, "learning_rate": 2e-05, "loss": 0.6604, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7093, "tokens_per_second_per_gpu": 10777.23, "total_tokens": 700574116 }, { "epoch": 0.44348587146786694, "grad_norm": 0.8889959454536438, "learning_rate": 2e-05, "loss": 0.6683, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7094, "tokens_per_second_per_gpu": 11216.97, "total_tokens": 700676366 }, { "epoch": 0.4435483870967742, "grad_norm": 0.893409252166748, "learning_rate": 2e-05, "loss": 0.6557, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7095, "tokens_per_second_per_gpu": 10712.88, "total_tokens": 700776760 }, { "epoch": 0.44361090272568143, "grad_norm": 0.9281811118125916, "learning_rate": 2e-05, "loss": 0.641, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7096, "tokens_per_second_per_gpu": 10448.05, "total_tokens": 700874653 }, { "epoch": 0.4436734183545886, "grad_norm": 0.9024391174316406, "learning_rate": 2e-05, "loss": 0.6715, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7097, "tokens_per_second_per_gpu": 10164.14, "total_tokens": 700971358 }, { "epoch": 0.44373593398349587, "grad_norm": 0.9636861085891724, "learning_rate": 2e-05, "loss": 0.6415, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7098, "tokens_per_second_per_gpu": 10911.51, "total_tokens": 701069337 }, { "epoch": 0.4437984496124031, "grad_norm": 0.8829118013381958, "learning_rate": 2e-05, "loss": 0.6441, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7099, "tokens_per_second_per_gpu": 11375.44, "total_tokens": 701171887 }, { "epoch": 0.4438609652413103, "grad_norm": 0.9714141488075256, "learning_rate": 2e-05, "loss": 0.6528, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7100, "tokens_per_second_per_gpu": 10178.47, "total_tokens": 701268725 }, { "epoch": 0.44392348087021755, "grad_norm": 0.9380256533622742, "learning_rate": 2e-05, "loss": 0.6821, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7101, "tokens_per_second_per_gpu": 9830.33, "total_tokens": 701364296 }, { "epoch": 0.4439859964991248, "grad_norm": 0.8836371898651123, "learning_rate": 2e-05, "loss": 0.6467, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7102, "tokens_per_second_per_gpu": 10916.45, "total_tokens": 701466946 }, { "epoch": 0.44404851212803204, "grad_norm": 0.8721321225166321, "learning_rate": 2e-05, "loss": 0.6525, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7103, "tokens_per_second_per_gpu": 10999.76, "total_tokens": 701566017 }, { "epoch": 0.4441110277569392, "grad_norm": 0.8858339190483093, "learning_rate": 2e-05, "loss": 0.6349, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7104, "tokens_per_second_per_gpu": 11017.88, "total_tokens": 701664953 }, { "epoch": 0.44417354338584647, "grad_norm": 0.8840398788452148, "learning_rate": 2e-05, "loss": 0.6583, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7105, "tokens_per_second_per_gpu": 10614.16, "total_tokens": 701766062 }, { "epoch": 0.4442360590147537, "grad_norm": 0.9160270690917969, "learning_rate": 2e-05, "loss": 0.6437, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7106, "tokens_per_second_per_gpu": 11069.47, "total_tokens": 701864487 }, { "epoch": 0.4442985746436609, "grad_norm": 0.9137469530105591, "learning_rate": 2e-05, "loss": 0.6333, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7107, "tokens_per_second_per_gpu": 9814.09, "total_tokens": 701959686 }, { "epoch": 0.44436109027256815, "grad_norm": 0.9135822057723999, "learning_rate": 2e-05, "loss": 0.6498, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7108, "tokens_per_second_per_gpu": 9640.84, "total_tokens": 702054033 }, { "epoch": 0.4444236059014754, "grad_norm": 0.8834441900253296, "learning_rate": 2e-05, "loss": 0.6416, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7109, "tokens_per_second_per_gpu": 10348.42, "total_tokens": 702153030 }, { "epoch": 0.4444861215303826, "grad_norm": 0.9313878417015076, "learning_rate": 2e-05, "loss": 0.6422, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7110, "tokens_per_second_per_gpu": 10907.6, "total_tokens": 702252775 }, { "epoch": 0.44454863715928983, "grad_norm": 0.8837932348251343, "learning_rate": 2e-05, "loss": 0.6595, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7111, "tokens_per_second_per_gpu": 11275.84, "total_tokens": 702350640 }, { "epoch": 0.4446111527881971, "grad_norm": 0.9448313117027283, "learning_rate": 2e-05, "loss": 0.6384, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7112, "tokens_per_second_per_gpu": 10392.06, "total_tokens": 702447478 }, { "epoch": 0.44467366841710426, "grad_norm": 0.8826894760131836, "learning_rate": 2e-05, "loss": 0.6547, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7113, "tokens_per_second_per_gpu": 11071.3, "total_tokens": 702550625 }, { "epoch": 0.4447361840460115, "grad_norm": 0.8834952116012573, "learning_rate": 2e-05, "loss": 0.6268, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7114, "tokens_per_second_per_gpu": 11014.87, "total_tokens": 702648993 }, { "epoch": 0.44479869967491875, "grad_norm": 0.9053289294242859, "learning_rate": 2e-05, "loss": 0.6707, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7115, "tokens_per_second_per_gpu": 10540.88, "total_tokens": 702745609 }, { "epoch": 0.44486121530382594, "grad_norm": 0.9202668070793152, "learning_rate": 2e-05, "loss": 0.6405, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7116, "tokens_per_second_per_gpu": 9908.46, "total_tokens": 702843868 }, { "epoch": 0.4449237309327332, "grad_norm": 0.8879589438438416, "learning_rate": 2e-05, "loss": 0.6559, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7117, "tokens_per_second_per_gpu": 10479.42, "total_tokens": 702945197 }, { "epoch": 0.44498624656164043, "grad_norm": 0.9203546047210693, "learning_rate": 2e-05, "loss": 0.6396, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7118, "tokens_per_second_per_gpu": 9875.23, "total_tokens": 703039589 }, { "epoch": 0.4450487621905476, "grad_norm": 0.8846540451049805, "learning_rate": 2e-05, "loss": 0.6668, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7119, "tokens_per_second_per_gpu": 10608.46, "total_tokens": 703140627 }, { "epoch": 0.44511127781945486, "grad_norm": 0.9289867281913757, "learning_rate": 2e-05, "loss": 0.6207, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7120, "tokens_per_second_per_gpu": 10084.96, "total_tokens": 703235002 }, { "epoch": 0.4451737934483621, "grad_norm": 0.8916687369346619, "learning_rate": 2e-05, "loss": 0.651, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7121, "tokens_per_second_per_gpu": 10065.54, "total_tokens": 703331706 }, { "epoch": 0.4452363090772693, "grad_norm": 0.8978309035301208, "learning_rate": 2e-05, "loss": 0.6106, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7122, "tokens_per_second_per_gpu": 10184.36, "total_tokens": 703426270 }, { "epoch": 0.44529882470617654, "grad_norm": 0.9043816924095154, "learning_rate": 2e-05, "loss": 0.6719, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7123, "tokens_per_second_per_gpu": 10975.32, "total_tokens": 703524919 }, { "epoch": 0.4453613403350838, "grad_norm": 0.9064317941665649, "learning_rate": 2e-05, "loss": 0.6158, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7124, "tokens_per_second_per_gpu": 10172.42, "total_tokens": 703619479 }, { "epoch": 0.445423855963991, "grad_norm": 0.922504723072052, "learning_rate": 2e-05, "loss": 0.6566, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7125, "tokens_per_second_per_gpu": 10571.87, "total_tokens": 703716910 }, { "epoch": 0.4454863715928982, "grad_norm": 0.894266664981842, "learning_rate": 2e-05, "loss": 0.6323, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7126, "tokens_per_second_per_gpu": 10860.43, "total_tokens": 703814940 }, { "epoch": 0.44554888722180547, "grad_norm": 0.9132824540138245, "learning_rate": 2e-05, "loss": 0.678, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7127, "tokens_per_second_per_gpu": 14113.26, "total_tokens": 703912411 }, { "epoch": 0.44561140285071266, "grad_norm": 0.8795382976531982, "learning_rate": 2e-05, "loss": 0.6482, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7128, "tokens_per_second_per_gpu": 10866.21, "total_tokens": 704013366 }, { "epoch": 0.4456739184796199, "grad_norm": 0.9050139784812927, "learning_rate": 2e-05, "loss": 0.6244, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7129, "tokens_per_second_per_gpu": 10547.64, "total_tokens": 704108187 }, { "epoch": 0.44573643410852715, "grad_norm": 0.9107913374900818, "learning_rate": 2e-05, "loss": 0.7, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7130, "tokens_per_second_per_gpu": 11105.16, "total_tokens": 704208942 }, { "epoch": 0.44579894973743434, "grad_norm": 0.868880033493042, "learning_rate": 2e-05, "loss": 0.5857, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7131, "tokens_per_second_per_gpu": 9560.59, "total_tokens": 704303932 }, { "epoch": 0.4458614653663416, "grad_norm": 0.8837324380874634, "learning_rate": 2e-05, "loss": 0.6434, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7132, "tokens_per_second_per_gpu": 10298.86, "total_tokens": 704401695 }, { "epoch": 0.4459239809952488, "grad_norm": 0.9025899171829224, "learning_rate": 2e-05, "loss": 0.6619, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7133, "tokens_per_second_per_gpu": 10905.68, "total_tokens": 704502675 }, { "epoch": 0.445986496624156, "grad_norm": 0.874199390411377, "learning_rate": 2e-05, "loss": 0.691, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7134, "tokens_per_second_per_gpu": 10580.23, "total_tokens": 704603423 }, { "epoch": 0.44604901225306326, "grad_norm": 0.9338014125823975, "learning_rate": 2e-05, "loss": 0.7163, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7135, "tokens_per_second_per_gpu": 10906.39, "total_tokens": 704704194 }, { "epoch": 0.4461115278819705, "grad_norm": 0.8927631378173828, "learning_rate": 2e-05, "loss": 0.7194, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7136, "tokens_per_second_per_gpu": 10647.07, "total_tokens": 704806636 }, { "epoch": 0.4461740435108777, "grad_norm": 0.8706602454185486, "learning_rate": 2e-05, "loss": 0.593, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7137, "tokens_per_second_per_gpu": 10429.03, "total_tokens": 704903083 }, { "epoch": 0.44623655913978494, "grad_norm": 0.8809493184089661, "learning_rate": 2e-05, "loss": 0.6465, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7138, "tokens_per_second_per_gpu": 10917.99, "total_tokens": 705005414 }, { "epoch": 0.4462990747686922, "grad_norm": 0.8895435929298401, "learning_rate": 2e-05, "loss": 0.6438, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7139, "tokens_per_second_per_gpu": 11171.53, "total_tokens": 705104079 }, { "epoch": 0.4463615903975994, "grad_norm": 0.9231353998184204, "learning_rate": 2e-05, "loss": 0.695, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7140, "tokens_per_second_per_gpu": 9688.15, "total_tokens": 705200871 }, { "epoch": 0.4464241060265066, "grad_norm": 0.838904857635498, "learning_rate": 2e-05, "loss": 0.61, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7141, "tokens_per_second_per_gpu": 10891.18, "total_tokens": 705304197 }, { "epoch": 0.44648662165541386, "grad_norm": 0.8775500059127808, "learning_rate": 2e-05, "loss": 0.6921, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7142, "tokens_per_second_per_gpu": 10884.76, "total_tokens": 705405000 }, { "epoch": 0.4465491372843211, "grad_norm": 0.8743927478790283, "learning_rate": 2e-05, "loss": 0.6579, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7143, "tokens_per_second_per_gpu": 10712.33, "total_tokens": 705503149 }, { "epoch": 0.4466116529132283, "grad_norm": 0.8730848431587219, "learning_rate": 2e-05, "loss": 0.6552, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7144, "tokens_per_second_per_gpu": 10212.99, "total_tokens": 705601214 }, { "epoch": 0.44667416854213554, "grad_norm": 0.9184964299201965, "learning_rate": 2e-05, "loss": 0.6539, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7145, "tokens_per_second_per_gpu": 10688.95, "total_tokens": 705701323 }, { "epoch": 0.4467366841710428, "grad_norm": 0.9208897352218628, "learning_rate": 2e-05, "loss": 0.657, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7146, "tokens_per_second_per_gpu": 10741.74, "total_tokens": 705800074 }, { "epoch": 0.44679919979995, "grad_norm": 0.9134120345115662, "learning_rate": 2e-05, "loss": 0.6624, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7147, "tokens_per_second_per_gpu": 10099.11, "total_tokens": 705894214 }, { "epoch": 0.4468617154288572, "grad_norm": 0.8784795999526978, "learning_rate": 2e-05, "loss": 0.6921, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7148, "tokens_per_second_per_gpu": 10299.64, "total_tokens": 705993530 }, { "epoch": 0.44692423105776447, "grad_norm": 0.8920250535011292, "learning_rate": 2e-05, "loss": 0.6037, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7149, "tokens_per_second_per_gpu": 9652.8, "total_tokens": 706086491 }, { "epoch": 0.44698674668667165, "grad_norm": 0.933541476726532, "learning_rate": 2e-05, "loss": 0.6414, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7150, "tokens_per_second_per_gpu": 10812.84, "total_tokens": 706186386 }, { "epoch": 0.4470492623155789, "grad_norm": 0.9028757214546204, "learning_rate": 2e-05, "loss": 0.678, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7151, "tokens_per_second_per_gpu": 11060.06, "total_tokens": 706285049 }, { "epoch": 0.44711177794448614, "grad_norm": 0.8662316203117371, "learning_rate": 2e-05, "loss": 0.6632, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7152, "tokens_per_second_per_gpu": 10773.86, "total_tokens": 706387829 }, { "epoch": 0.44717429357339333, "grad_norm": 0.8611969351768494, "learning_rate": 2e-05, "loss": 0.6436, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7153, "tokens_per_second_per_gpu": 11116.33, "total_tokens": 706489425 }, { "epoch": 0.4472368092023006, "grad_norm": 0.8815008401870728, "learning_rate": 2e-05, "loss": 0.6636, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7154, "tokens_per_second_per_gpu": 10678.28, "total_tokens": 706588950 }, { "epoch": 0.4472993248312078, "grad_norm": 0.896152675151825, "learning_rate": 2e-05, "loss": 0.722, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7155, "tokens_per_second_per_gpu": 10970.28, "total_tokens": 706688915 }, { "epoch": 0.447361840460115, "grad_norm": 0.8676891922950745, "learning_rate": 2e-05, "loss": 0.6435, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7156, "tokens_per_second_per_gpu": 11240.76, "total_tokens": 706792892 }, { "epoch": 0.44742435608902226, "grad_norm": 0.956910252571106, "learning_rate": 2e-05, "loss": 0.6653, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7157, "tokens_per_second_per_gpu": 9656.2, "total_tokens": 706889482 }, { "epoch": 0.4474868717179295, "grad_norm": 0.8737943172454834, "learning_rate": 2e-05, "loss": 0.6145, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7158, "tokens_per_second_per_gpu": 10090.33, "total_tokens": 706986485 }, { "epoch": 0.4475493873468367, "grad_norm": 0.8663139343261719, "learning_rate": 2e-05, "loss": 0.6457, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7159, "tokens_per_second_per_gpu": 10492.77, "total_tokens": 707085610 }, { "epoch": 0.44761190297574394, "grad_norm": 0.8775075078010559, "learning_rate": 2e-05, "loss": 0.6696, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7160, "tokens_per_second_per_gpu": 11232.92, "total_tokens": 707186936 }, { "epoch": 0.4476744186046512, "grad_norm": 0.8906144499778748, "learning_rate": 2e-05, "loss": 0.6366, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7161, "tokens_per_second_per_gpu": 9594.51, "total_tokens": 707280242 }, { "epoch": 0.44773693423355837, "grad_norm": 0.8672236800193787, "learning_rate": 2e-05, "loss": 0.6716, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7162, "tokens_per_second_per_gpu": 10701.05, "total_tokens": 707381344 }, { "epoch": 0.4477994498624656, "grad_norm": 0.9027977585792542, "learning_rate": 2e-05, "loss": 0.6123, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7163, "tokens_per_second_per_gpu": 9806.58, "total_tokens": 707475508 }, { "epoch": 0.44786196549137286, "grad_norm": 0.8691273927688599, "learning_rate": 2e-05, "loss": 0.5797, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7164, "tokens_per_second_per_gpu": 9700.3, "total_tokens": 707569917 }, { "epoch": 0.44792448112028005, "grad_norm": 0.9021016955375671, "learning_rate": 2e-05, "loss": 0.6133, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7165, "tokens_per_second_per_gpu": 10224.9, "total_tokens": 707665683 }, { "epoch": 0.4479869967491873, "grad_norm": 0.9219500422477722, "learning_rate": 2e-05, "loss": 0.645, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7166, "tokens_per_second_per_gpu": 10334.21, "total_tokens": 707761488 }, { "epoch": 0.44804951237809454, "grad_norm": 0.9246526956558228, "learning_rate": 2e-05, "loss": 0.6301, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7167, "tokens_per_second_per_gpu": 10458.42, "total_tokens": 707859515 }, { "epoch": 0.44811202800700173, "grad_norm": 0.8939332365989685, "learning_rate": 2e-05, "loss": 0.6396, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7168, "tokens_per_second_per_gpu": 10069.18, "total_tokens": 707955536 }, { "epoch": 0.448174543635909, "grad_norm": 0.9370980262756348, "learning_rate": 2e-05, "loss": 0.6919, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7169, "tokens_per_second_per_gpu": 10728.44, "total_tokens": 708050023 }, { "epoch": 0.4482370592648162, "grad_norm": 0.9271583557128906, "learning_rate": 2e-05, "loss": 0.6366, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7170, "tokens_per_second_per_gpu": 10904.81, "total_tokens": 708147171 }, { "epoch": 0.4482995748937234, "grad_norm": 0.8588523268699646, "learning_rate": 2e-05, "loss": 0.6555, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7171, "tokens_per_second_per_gpu": 11035.39, "total_tokens": 708249918 }, { "epoch": 0.44836209052263065, "grad_norm": 0.8794572353363037, "learning_rate": 2e-05, "loss": 0.6371, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7172, "tokens_per_second_per_gpu": 10169.94, "total_tokens": 708345176 }, { "epoch": 0.4484246061515379, "grad_norm": 0.8705317378044128, "learning_rate": 2e-05, "loss": 0.6726, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7173, "tokens_per_second_per_gpu": 10954.85, "total_tokens": 708447128 }, { "epoch": 0.4484871217804451, "grad_norm": 0.9400911927223206, "learning_rate": 2e-05, "loss": 0.6334, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7174, "tokens_per_second_per_gpu": 10230.75, "total_tokens": 708544795 }, { "epoch": 0.44854963740935233, "grad_norm": 0.9397023320198059, "learning_rate": 2e-05, "loss": 0.7175, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7175, "tokens_per_second_per_gpu": 10982.99, "total_tokens": 708647282 }, { "epoch": 0.4486121530382596, "grad_norm": 0.9114899039268494, "learning_rate": 2e-05, "loss": 0.6376, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7176, "tokens_per_second_per_gpu": 10153.96, "total_tokens": 708743933 }, { "epoch": 0.44867466866716676, "grad_norm": 0.9017017483711243, "learning_rate": 2e-05, "loss": 0.6915, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7177, "tokens_per_second_per_gpu": 10468.81, "total_tokens": 708844086 }, { "epoch": 0.448737184296074, "grad_norm": 0.914875864982605, "learning_rate": 2e-05, "loss": 0.6562, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7178, "tokens_per_second_per_gpu": 10591.33, "total_tokens": 708939706 }, { "epoch": 0.44879969992498125, "grad_norm": 0.9727948307991028, "learning_rate": 2e-05, "loss": 0.6408, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7179, "tokens_per_second_per_gpu": 9304.64, "total_tokens": 709035012 }, { "epoch": 0.4488622155538885, "grad_norm": 0.8771048188209534, "learning_rate": 2e-05, "loss": 0.6482, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7180, "tokens_per_second_per_gpu": 10964.56, "total_tokens": 709136650 }, { "epoch": 0.4489247311827957, "grad_norm": 0.8939985036849976, "learning_rate": 2e-05, "loss": 0.6128, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7181, "tokens_per_second_per_gpu": 9924.79, "total_tokens": 709231630 }, { "epoch": 0.44898724681170293, "grad_norm": 0.8825680017471313, "learning_rate": 2e-05, "loss": 0.6388, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7182, "tokens_per_second_per_gpu": 10337.33, "total_tokens": 709329520 }, { "epoch": 0.4490497624406102, "grad_norm": 0.9244505763053894, "learning_rate": 2e-05, "loss": 0.6534, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7183, "tokens_per_second_per_gpu": 10481.89, "total_tokens": 709429930 }, { "epoch": 0.44911227806951737, "grad_norm": 0.9256296753883362, "learning_rate": 2e-05, "loss": 0.6492, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7184, "tokens_per_second_per_gpu": 10759.16, "total_tokens": 709526014 }, { "epoch": 0.4491747936984246, "grad_norm": 0.9544985294342041, "learning_rate": 2e-05, "loss": 0.6443, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7185, "tokens_per_second_per_gpu": 10301.34, "total_tokens": 709619760 }, { "epoch": 0.44923730932733186, "grad_norm": 0.8824186325073242, "learning_rate": 2e-05, "loss": 0.666, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7186, "tokens_per_second_per_gpu": 10765.62, "total_tokens": 709720523 }, { "epoch": 0.44929982495623905, "grad_norm": 0.8966525793075562, "learning_rate": 2e-05, "loss": 0.6454, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7187, "tokens_per_second_per_gpu": 10499.37, "total_tokens": 709817484 }, { "epoch": 0.4493623405851463, "grad_norm": 0.8868585824966431, "learning_rate": 2e-05, "loss": 0.6342, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7188, "tokens_per_second_per_gpu": 10823.88, "total_tokens": 709913207 }, { "epoch": 0.44942485621405354, "grad_norm": 0.8836836218833923, "learning_rate": 2e-05, "loss": 0.6263, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7189, "tokens_per_second_per_gpu": 10584.83, "total_tokens": 710012599 }, { "epoch": 0.4494873718429607, "grad_norm": 0.8678376078605652, "learning_rate": 2e-05, "loss": 0.6303, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7190, "tokens_per_second_per_gpu": 10202.04, "total_tokens": 710110111 }, { "epoch": 0.44954988747186797, "grad_norm": 0.9052138924598694, "learning_rate": 2e-05, "loss": 0.6132, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7191, "tokens_per_second_per_gpu": 10575.77, "total_tokens": 710208087 }, { "epoch": 0.4496124031007752, "grad_norm": 0.8726978898048401, "learning_rate": 2e-05, "loss": 0.636, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7192, "tokens_per_second_per_gpu": 10562.38, "total_tokens": 710308480 }, { "epoch": 0.4496749187296824, "grad_norm": 0.9260903596878052, "learning_rate": 2e-05, "loss": 0.6689, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7193, "tokens_per_second_per_gpu": 10967.85, "total_tokens": 710407050 }, { "epoch": 0.44973743435858965, "grad_norm": 0.9183023571968079, "learning_rate": 2e-05, "loss": 0.6494, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7194, "tokens_per_second_per_gpu": 10140.28, "total_tokens": 710502831 }, { "epoch": 0.4497999499874969, "grad_norm": 0.939067006111145, "learning_rate": 2e-05, "loss": 0.6519, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7195, "tokens_per_second_per_gpu": 10017.88, "total_tokens": 710599324 }, { "epoch": 0.4498624656164041, "grad_norm": 0.8920615315437317, "learning_rate": 2e-05, "loss": 0.6494, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7196, "tokens_per_second_per_gpu": 11419.08, "total_tokens": 710700676 }, { "epoch": 0.44992498124531133, "grad_norm": 0.9359158873558044, "learning_rate": 2e-05, "loss": 0.6877, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7197, "tokens_per_second_per_gpu": 10393.14, "total_tokens": 710797210 }, { "epoch": 0.4499874968742186, "grad_norm": 0.8931920528411865, "learning_rate": 2e-05, "loss": 0.6704, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7198, "tokens_per_second_per_gpu": 11661.02, "total_tokens": 710898100 }, { "epoch": 0.45005001250312576, "grad_norm": 0.9073492884635925, "learning_rate": 2e-05, "loss": 0.6375, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7199, "tokens_per_second_per_gpu": 9668.4, "total_tokens": 710991948 }, { "epoch": 0.450112528132033, "grad_norm": 0.9107135534286499, "learning_rate": 2e-05, "loss": 0.6709, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7200, "tokens_per_second_per_gpu": 10348.83, "total_tokens": 711091108 }, { "epoch": 0.45017504376094025, "grad_norm": 0.9585880637168884, "learning_rate": 2e-05, "loss": 0.6829, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7201, "tokens_per_second_per_gpu": 9422.75, "total_tokens": 711183237 }, { "epoch": 0.45023755938984744, "grad_norm": 0.9081416726112366, "learning_rate": 2e-05, "loss": 0.6345, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7202, "tokens_per_second_per_gpu": 10023.63, "total_tokens": 711269646 }, { "epoch": 0.4503000750187547, "grad_norm": 0.9373603463172913, "learning_rate": 2e-05, "loss": 0.6106, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7203, "tokens_per_second_per_gpu": 10423.95, "total_tokens": 711366129 }, { "epoch": 0.45036259064766193, "grad_norm": 0.9377392530441284, "learning_rate": 2e-05, "loss": 0.693, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7204, "tokens_per_second_per_gpu": 10846.02, "total_tokens": 711468727 }, { "epoch": 0.4504251062765691, "grad_norm": 0.9570655822753906, "learning_rate": 2e-05, "loss": 0.6165, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7205, "tokens_per_second_per_gpu": 9255.64, "total_tokens": 711562377 }, { "epoch": 0.45048762190547637, "grad_norm": 0.9319568276405334, "learning_rate": 2e-05, "loss": 0.6545, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7206, "tokens_per_second_per_gpu": 10701.08, "total_tokens": 711662884 }, { "epoch": 0.4505501375343836, "grad_norm": 0.925572395324707, "learning_rate": 2e-05, "loss": 0.6734, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7207, "tokens_per_second_per_gpu": 10503.96, "total_tokens": 711757974 }, { "epoch": 0.4506126531632908, "grad_norm": 0.8499796986579895, "learning_rate": 2e-05, "loss": 0.602, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7208, "tokens_per_second_per_gpu": 9687.99, "total_tokens": 711854513 }, { "epoch": 0.45067516879219804, "grad_norm": 0.9186550974845886, "learning_rate": 2e-05, "loss": 0.6644, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7209, "tokens_per_second_per_gpu": 9975.15, "total_tokens": 711947622 }, { "epoch": 0.4507376844211053, "grad_norm": 0.8873003125190735, "learning_rate": 2e-05, "loss": 0.6377, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7210, "tokens_per_second_per_gpu": 10848.59, "total_tokens": 712049063 }, { "epoch": 0.4508002000500125, "grad_norm": 0.8641374111175537, "learning_rate": 2e-05, "loss": 0.6591, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7211, "tokens_per_second_per_gpu": 11216.4, "total_tokens": 712152488 }, { "epoch": 0.4508627156789197, "grad_norm": 0.9574075937271118, "learning_rate": 2e-05, "loss": 0.676, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7212, "tokens_per_second_per_gpu": 9924.53, "total_tokens": 712246795 }, { "epoch": 0.45092523130782697, "grad_norm": 0.8934913873672485, "learning_rate": 2e-05, "loss": 0.6537, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7213, "tokens_per_second_per_gpu": 10936.64, "total_tokens": 712343290 }, { "epoch": 0.45098774693673416, "grad_norm": 0.9043043851852417, "learning_rate": 2e-05, "loss": 0.6773, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7214, "tokens_per_second_per_gpu": 9863.1, "total_tokens": 712437358 }, { "epoch": 0.4510502625656414, "grad_norm": 0.9529708027839661, "learning_rate": 2e-05, "loss": 0.651, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7215, "tokens_per_second_per_gpu": 10518.54, "total_tokens": 712534386 }, { "epoch": 0.45111277819454865, "grad_norm": 0.9144620895385742, "learning_rate": 2e-05, "loss": 0.6212, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7216, "tokens_per_second_per_gpu": 10692.91, "total_tokens": 712632288 }, { "epoch": 0.45117529382345584, "grad_norm": 0.9199972152709961, "learning_rate": 2e-05, "loss": 0.6414, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7217, "tokens_per_second_per_gpu": 9246.56, "total_tokens": 712726656 }, { "epoch": 0.4512378094523631, "grad_norm": 0.9141101241111755, "learning_rate": 2e-05, "loss": 0.6218, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7218, "tokens_per_second_per_gpu": 11028.2, "total_tokens": 712826455 }, { "epoch": 0.4513003250812703, "grad_norm": 0.9503135085105896, "learning_rate": 2e-05, "loss": 0.6476, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7219, "tokens_per_second_per_gpu": 10283.83, "total_tokens": 712925312 }, { "epoch": 0.45136284071017757, "grad_norm": 0.9818016886711121, "learning_rate": 2e-05, "loss": 0.6776, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7220, "tokens_per_second_per_gpu": 10529.23, "total_tokens": 713021525 }, { "epoch": 0.45142535633908476, "grad_norm": 0.8470000624656677, "learning_rate": 2e-05, "loss": 0.6314, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7221, "tokens_per_second_per_gpu": 10632.29, "total_tokens": 713119512 }, { "epoch": 0.451487871967992, "grad_norm": 0.9168567657470703, "learning_rate": 2e-05, "loss": 0.665, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7222, "tokens_per_second_per_gpu": 10685.45, "total_tokens": 713215724 }, { "epoch": 0.45155038759689925, "grad_norm": 0.90024733543396, "learning_rate": 2e-05, "loss": 0.6385, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7223, "tokens_per_second_per_gpu": 10541.41, "total_tokens": 713313755 }, { "epoch": 0.45161290322580644, "grad_norm": 0.8791796565055847, "learning_rate": 2e-05, "loss": 0.6622, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7224, "tokens_per_second_per_gpu": 10711.68, "total_tokens": 713414021 }, { "epoch": 0.4516754188547137, "grad_norm": 0.8943783044815063, "learning_rate": 2e-05, "loss": 0.658, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7225, "tokens_per_second_per_gpu": 10239.54, "total_tokens": 713511343 }, { "epoch": 0.45173793448362093, "grad_norm": 0.8988677859306335, "learning_rate": 2e-05, "loss": 0.6732, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7226, "tokens_per_second_per_gpu": 9586.4, "total_tokens": 713610581 }, { "epoch": 0.4518004501125281, "grad_norm": 0.8910345435142517, "learning_rate": 2e-05, "loss": 0.7051, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7227, "tokens_per_second_per_gpu": 11096.19, "total_tokens": 713712776 }, { "epoch": 0.45186296574143536, "grad_norm": 0.8898983001708984, "learning_rate": 2e-05, "loss": 0.632, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7228, "tokens_per_second_per_gpu": 10050.68, "total_tokens": 713810282 }, { "epoch": 0.4519254813703426, "grad_norm": 0.9044360518455505, "learning_rate": 2e-05, "loss": 0.6879, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7229, "tokens_per_second_per_gpu": 11181.45, "total_tokens": 713915965 }, { "epoch": 0.4519879969992498, "grad_norm": 0.9017259478569031, "learning_rate": 2e-05, "loss": 0.6709, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7230, "tokens_per_second_per_gpu": 10199.04, "total_tokens": 714011456 }, { "epoch": 0.45205051262815704, "grad_norm": 0.8809120059013367, "learning_rate": 2e-05, "loss": 0.6143, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7231, "tokens_per_second_per_gpu": 10769.17, "total_tokens": 714110378 }, { "epoch": 0.4521130282570643, "grad_norm": 0.879221498966217, "learning_rate": 2e-05, "loss": 0.6513, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7232, "tokens_per_second_per_gpu": 11070.12, "total_tokens": 714212620 }, { "epoch": 0.4521755438859715, "grad_norm": 0.9010623693466187, "learning_rate": 2e-05, "loss": 0.658, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7233, "tokens_per_second_per_gpu": 10660.88, "total_tokens": 714311686 }, { "epoch": 0.4522380595148787, "grad_norm": 0.9012585282325745, "learning_rate": 2e-05, "loss": 0.6611, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7234, "tokens_per_second_per_gpu": 10417.08, "total_tokens": 714408347 }, { "epoch": 0.45230057514378597, "grad_norm": 0.9048435688018799, "learning_rate": 2e-05, "loss": 0.6113, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7235, "tokens_per_second_per_gpu": 9582.44, "total_tokens": 714497596 }, { "epoch": 0.45236309077269315, "grad_norm": 0.9133062362670898, "learning_rate": 2e-05, "loss": 0.6314, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7236, "tokens_per_second_per_gpu": 10638.71, "total_tokens": 714594942 }, { "epoch": 0.4524256064016004, "grad_norm": 0.9461898803710938, "learning_rate": 2e-05, "loss": 0.5839, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7237, "tokens_per_second_per_gpu": 9101.4, "total_tokens": 714686654 }, { "epoch": 0.45248812203050764, "grad_norm": 0.8358024954795837, "learning_rate": 2e-05, "loss": 0.6067, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7238, "tokens_per_second_per_gpu": 11565.37, "total_tokens": 714790281 }, { "epoch": 0.45255063765941483, "grad_norm": 0.8965557813644409, "learning_rate": 2e-05, "loss": 0.6463, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7239, "tokens_per_second_per_gpu": 11150.38, "total_tokens": 714889615 }, { "epoch": 0.4526131532883221, "grad_norm": 0.9160778522491455, "learning_rate": 2e-05, "loss": 0.6261, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7240, "tokens_per_second_per_gpu": 10014.94, "total_tokens": 714983434 }, { "epoch": 0.4526756689172293, "grad_norm": 0.9241043925285339, "learning_rate": 2e-05, "loss": 0.6718, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7241, "tokens_per_second_per_gpu": 10234.37, "total_tokens": 715084314 }, { "epoch": 0.4527381845461365, "grad_norm": 0.9046474099159241, "learning_rate": 2e-05, "loss": 0.6507, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7242, "tokens_per_second_per_gpu": 10377.42, "total_tokens": 715181432 }, { "epoch": 0.45280070017504376, "grad_norm": 0.9363265633583069, "learning_rate": 2e-05, "loss": 0.6274, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7243, "tokens_per_second_per_gpu": 9337.24, "total_tokens": 715276133 }, { "epoch": 0.452863215803951, "grad_norm": 0.9117872714996338, "learning_rate": 2e-05, "loss": 0.6466, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7244, "tokens_per_second_per_gpu": 10290.08, "total_tokens": 715372106 }, { "epoch": 0.4529257314328582, "grad_norm": 0.9632075428962708, "learning_rate": 2e-05, "loss": 0.6787, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7245, "tokens_per_second_per_gpu": 10539.9, "total_tokens": 715473140 }, { "epoch": 0.45298824706176544, "grad_norm": 0.9425082802772522, "learning_rate": 2e-05, "loss": 0.6344, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7246, "tokens_per_second_per_gpu": 9530.42, "total_tokens": 715567488 }, { "epoch": 0.4530507626906727, "grad_norm": 1.0073169469833374, "learning_rate": 2e-05, "loss": 0.6469, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7247, "tokens_per_second_per_gpu": 10378.73, "total_tokens": 715664860 }, { "epoch": 0.45311327831957987, "grad_norm": 0.9800176024436951, "learning_rate": 2e-05, "loss": 0.6539, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7248, "tokens_per_second_per_gpu": 11076.86, "total_tokens": 715760949 }, { "epoch": 0.4531757939484871, "grad_norm": 0.9415122866630554, "learning_rate": 2e-05, "loss": 0.6969, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7249, "tokens_per_second_per_gpu": 9850.59, "total_tokens": 715855897 }, { "epoch": 0.45323830957739436, "grad_norm": 1.0068068504333496, "learning_rate": 2e-05, "loss": 0.6875, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7250, "tokens_per_second_per_gpu": 11121.11, "total_tokens": 715955795 }, { "epoch": 0.45330082520630155, "grad_norm": 0.9120262265205383, "learning_rate": 2e-05, "loss": 0.6283, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7251, "tokens_per_second_per_gpu": 10010.86, "total_tokens": 716054270 }, { "epoch": 0.4533633408352088, "grad_norm": 0.8833261132240295, "learning_rate": 2e-05, "loss": 0.6239, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7252, "tokens_per_second_per_gpu": 11771.59, "total_tokens": 716152428 }, { "epoch": 0.45342585646411604, "grad_norm": 0.8614950180053711, "learning_rate": 2e-05, "loss": 0.6509, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7253, "tokens_per_second_per_gpu": 10112.34, "total_tokens": 716249322 }, { "epoch": 0.45348837209302323, "grad_norm": 0.9201508164405823, "learning_rate": 2e-05, "loss": 0.6568, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7254, "tokens_per_second_per_gpu": 10185.36, "total_tokens": 716343685 }, { "epoch": 0.4535508877219305, "grad_norm": 0.8994030356407166, "learning_rate": 2e-05, "loss": 0.6232, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7255, "tokens_per_second_per_gpu": 9975.3, "total_tokens": 716440992 }, { "epoch": 0.4536134033508377, "grad_norm": 0.9738101363182068, "learning_rate": 2e-05, "loss": 0.6619, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7256, "tokens_per_second_per_gpu": 10268.83, "total_tokens": 716536916 }, { "epoch": 0.45367591897974496, "grad_norm": 0.9050236940383911, "learning_rate": 2e-05, "loss": 0.6434, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7257, "tokens_per_second_per_gpu": 10123.47, "total_tokens": 716634643 }, { "epoch": 0.45373843460865215, "grad_norm": 0.8705036640167236, "learning_rate": 2e-05, "loss": 0.6295, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7258, "tokens_per_second_per_gpu": 10408.17, "total_tokens": 716732150 }, { "epoch": 0.4538009502375594, "grad_norm": 0.907965362071991, "learning_rate": 2e-05, "loss": 0.6615, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7259, "tokens_per_second_per_gpu": 10490.37, "total_tokens": 716827960 }, { "epoch": 0.45386346586646664, "grad_norm": 0.8878414630889893, "learning_rate": 2e-05, "loss": 0.6622, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7260, "tokens_per_second_per_gpu": 10988.3, "total_tokens": 716928143 }, { "epoch": 0.45392598149537383, "grad_norm": 0.9084601998329163, "learning_rate": 2e-05, "loss": 0.6632, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7261, "tokens_per_second_per_gpu": 10278.99, "total_tokens": 717026965 }, { "epoch": 0.4539884971242811, "grad_norm": 0.9138191342353821, "learning_rate": 2e-05, "loss": 0.6721, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7262, "tokens_per_second_per_gpu": 10084.22, "total_tokens": 717125501 }, { "epoch": 0.4540510127531883, "grad_norm": 0.9181207418441772, "learning_rate": 2e-05, "loss": 0.6452, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7263, "tokens_per_second_per_gpu": 10451.87, "total_tokens": 717221164 }, { "epoch": 0.4541135283820955, "grad_norm": 0.9162507057189941, "learning_rate": 2e-05, "loss": 0.6696, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7264, "tokens_per_second_per_gpu": 9759.32, "total_tokens": 717318091 }, { "epoch": 0.45417604401100276, "grad_norm": 0.9150890111923218, "learning_rate": 2e-05, "loss": 0.6506, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7265, "tokens_per_second_per_gpu": 10415.76, "total_tokens": 717412127 }, { "epoch": 0.45423855963991, "grad_norm": 0.8960477113723755, "learning_rate": 2e-05, "loss": 0.6293, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7266, "tokens_per_second_per_gpu": 9998.14, "total_tokens": 717508038 }, { "epoch": 0.4543010752688172, "grad_norm": 0.8758211135864258, "learning_rate": 2e-05, "loss": 0.6257, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7267, "tokens_per_second_per_gpu": 10063.68, "total_tokens": 717605097 }, { "epoch": 0.45436359089772443, "grad_norm": 0.9561384320259094, "learning_rate": 2e-05, "loss": 0.6274, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7268, "tokens_per_second_per_gpu": 9895.39, "total_tokens": 717697854 }, { "epoch": 0.4544261065266317, "grad_norm": 0.9118574857711792, "learning_rate": 2e-05, "loss": 0.6629, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7269, "tokens_per_second_per_gpu": 10371.36, "total_tokens": 717793909 }, { "epoch": 0.45448862215553887, "grad_norm": 0.8730389475822449, "learning_rate": 2e-05, "loss": 0.6399, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7270, "tokens_per_second_per_gpu": 11057.7, "total_tokens": 717894075 }, { "epoch": 0.4545511377844461, "grad_norm": 0.8764525055885315, "learning_rate": 2e-05, "loss": 0.6398, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7271, "tokens_per_second_per_gpu": 10755.2, "total_tokens": 717993193 }, { "epoch": 0.45461365341335336, "grad_norm": 0.8927578926086426, "learning_rate": 2e-05, "loss": 0.6474, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7272, "tokens_per_second_per_gpu": 10378.89, "total_tokens": 718094361 }, { "epoch": 0.45467616904226055, "grad_norm": 0.9815407395362854, "learning_rate": 2e-05, "loss": 0.6497, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7273, "tokens_per_second_per_gpu": 10181.35, "total_tokens": 718190338 }, { "epoch": 0.4547386846711678, "grad_norm": 0.9263256192207336, "learning_rate": 2e-05, "loss": 0.6782, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7274, "tokens_per_second_per_gpu": 10052.5, "total_tokens": 718284527 }, { "epoch": 0.45480120030007504, "grad_norm": 0.9408365488052368, "learning_rate": 2e-05, "loss": 0.6291, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7275, "tokens_per_second_per_gpu": 9904.32, "total_tokens": 718380180 }, { "epoch": 0.4548637159289822, "grad_norm": 0.8646200299263, "learning_rate": 2e-05, "loss": 0.6038, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7276, "tokens_per_second_per_gpu": 10312.49, "total_tokens": 718478460 }, { "epoch": 0.45492623155788947, "grad_norm": 0.9007613062858582, "learning_rate": 2e-05, "loss": 0.6303, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7277, "tokens_per_second_per_gpu": 10208.17, "total_tokens": 718576832 }, { "epoch": 0.4549887471867967, "grad_norm": 0.9199528694152832, "learning_rate": 2e-05, "loss": 0.6669, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7278, "tokens_per_second_per_gpu": 9991.92, "total_tokens": 718674094 }, { "epoch": 0.4550512628157039, "grad_norm": 0.8893802165985107, "learning_rate": 2e-05, "loss": 0.6598, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7279, "tokens_per_second_per_gpu": 10458.44, "total_tokens": 718770548 }, { "epoch": 0.45511377844461115, "grad_norm": 0.9263593554496765, "learning_rate": 2e-05, "loss": 0.6434, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7280, "tokens_per_second_per_gpu": 9276.05, "total_tokens": 718866206 }, { "epoch": 0.4551762940735184, "grad_norm": 0.9362300634384155, "learning_rate": 2e-05, "loss": 0.7044, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7281, "tokens_per_second_per_gpu": 9924.55, "total_tokens": 718968094 }, { "epoch": 0.4552388097024256, "grad_norm": 0.9123566150665283, "learning_rate": 2e-05, "loss": 0.6508, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7282, "tokens_per_second_per_gpu": 10378.16, "total_tokens": 719066537 }, { "epoch": 0.45530132533133283, "grad_norm": 0.927222490310669, "learning_rate": 2e-05, "loss": 0.5868, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7283, "tokens_per_second_per_gpu": 10150.33, "total_tokens": 719155142 }, { "epoch": 0.4553638409602401, "grad_norm": 0.876557469367981, "learning_rate": 2e-05, "loss": 0.631, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7284, "tokens_per_second_per_gpu": 11013.71, "total_tokens": 719253980 }, { "epoch": 0.45542635658914726, "grad_norm": 0.8791829347610474, "learning_rate": 2e-05, "loss": 0.6828, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7285, "tokens_per_second_per_gpu": 10552.04, "total_tokens": 719355019 }, { "epoch": 0.4554888722180545, "grad_norm": 0.9171221852302551, "learning_rate": 2e-05, "loss": 0.6654, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7286, "tokens_per_second_per_gpu": 10758.52, "total_tokens": 719456242 }, { "epoch": 0.45555138784696175, "grad_norm": 0.8748297095298767, "learning_rate": 2e-05, "loss": 0.6241, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7287, "tokens_per_second_per_gpu": 11106.1, "total_tokens": 719555525 }, { "epoch": 0.45561390347586894, "grad_norm": 0.8960646390914917, "learning_rate": 2e-05, "loss": 0.6547, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7288, "tokens_per_second_per_gpu": 10529.79, "total_tokens": 719654094 }, { "epoch": 0.4556764191047762, "grad_norm": 0.8677330017089844, "learning_rate": 2e-05, "loss": 0.6252, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7289, "tokens_per_second_per_gpu": 10081.52, "total_tokens": 719754130 }, { "epoch": 0.45573893473368343, "grad_norm": 0.8995880484580994, "learning_rate": 2e-05, "loss": 0.6621, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7290, "tokens_per_second_per_gpu": 10323.78, "total_tokens": 719851414 }, { "epoch": 0.4558014503625906, "grad_norm": 0.9465535283088684, "learning_rate": 2e-05, "loss": 0.6996, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7291, "tokens_per_second_per_gpu": 10609.07, "total_tokens": 719951502 }, { "epoch": 0.45586396599149787, "grad_norm": 0.9212701916694641, "learning_rate": 2e-05, "loss": 0.6541, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7292, "tokens_per_second_per_gpu": 11107.92, "total_tokens": 720052449 }, { "epoch": 0.4559264816204051, "grad_norm": 0.8921207189559937, "learning_rate": 2e-05, "loss": 0.6256, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7293, "tokens_per_second_per_gpu": 11080.92, "total_tokens": 720152043 }, { "epoch": 0.45598899724931236, "grad_norm": 0.9281430840492249, "learning_rate": 2e-05, "loss": 0.6353, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7294, "tokens_per_second_per_gpu": 10872.38, "total_tokens": 720254447 }, { "epoch": 0.45605151287821954, "grad_norm": 0.9358526468276978, "learning_rate": 2e-05, "loss": 0.6591, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7295, "tokens_per_second_per_gpu": 10947.88, "total_tokens": 720356984 }, { "epoch": 0.4561140285071268, "grad_norm": 0.939415454864502, "learning_rate": 2e-05, "loss": 0.6443, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7296, "tokens_per_second_per_gpu": 10116.95, "total_tokens": 720454814 }, { "epoch": 0.45617654413603403, "grad_norm": 0.8650565147399902, "learning_rate": 2e-05, "loss": 0.6753, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7297, "tokens_per_second_per_gpu": 11195.51, "total_tokens": 720558700 }, { "epoch": 0.4562390597649412, "grad_norm": 0.879335343837738, "learning_rate": 2e-05, "loss": 0.6553, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7298, "tokens_per_second_per_gpu": 10935.7, "total_tokens": 720661147 }, { "epoch": 0.45630157539384847, "grad_norm": 0.8721733093261719, "learning_rate": 2e-05, "loss": 0.662, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7299, "tokens_per_second_per_gpu": 10631.75, "total_tokens": 720761994 }, { "epoch": 0.4563640910227557, "grad_norm": 0.8958504796028137, "learning_rate": 2e-05, "loss": 0.6825, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7300, "tokens_per_second_per_gpu": 10821.33, "total_tokens": 720863049 }, { "epoch": 0.4564266066516629, "grad_norm": 0.8688865303993225, "learning_rate": 2e-05, "loss": 0.6196, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7301, "tokens_per_second_per_gpu": 11031.71, "total_tokens": 720963715 }, { "epoch": 0.45648912228057015, "grad_norm": 0.8630439639091492, "learning_rate": 2e-05, "loss": 0.5979, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7302, "tokens_per_second_per_gpu": 10092.26, "total_tokens": 721062012 }, { "epoch": 0.4565516379094774, "grad_norm": 0.8827049136161804, "learning_rate": 2e-05, "loss": 0.6493, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7303, "tokens_per_second_per_gpu": 11132.88, "total_tokens": 721161605 }, { "epoch": 0.4566141535383846, "grad_norm": 0.8748756051063538, "learning_rate": 2e-05, "loss": 0.6541, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7304, "tokens_per_second_per_gpu": 10640.58, "total_tokens": 721263547 }, { "epoch": 0.4566766691672918, "grad_norm": 0.8217324614524841, "learning_rate": 2e-05, "loss": 0.603, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7305, "tokens_per_second_per_gpu": 10177.31, "total_tokens": 721360334 }, { "epoch": 0.45673918479619907, "grad_norm": 0.9298117160797119, "learning_rate": 2e-05, "loss": 0.6395, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7306, "tokens_per_second_per_gpu": 11506.41, "total_tokens": 721459342 }, { "epoch": 0.45680170042510626, "grad_norm": 0.8960136771202087, "learning_rate": 2e-05, "loss": 0.678, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7307, "tokens_per_second_per_gpu": 10978.59, "total_tokens": 721561520 }, { "epoch": 0.4568642160540135, "grad_norm": 0.9215466976165771, "learning_rate": 2e-05, "loss": 0.6612, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7308, "tokens_per_second_per_gpu": 10242.92, "total_tokens": 721660675 }, { "epoch": 0.45692673168292075, "grad_norm": 0.8754458427429199, "learning_rate": 2e-05, "loss": 0.6098, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7309, "tokens_per_second_per_gpu": 9311.42, "total_tokens": 721756113 }, { "epoch": 0.45698924731182794, "grad_norm": 0.8770872950553894, "learning_rate": 2e-05, "loss": 0.6682, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7310, "tokens_per_second_per_gpu": 9998.75, "total_tokens": 721860311 }, { "epoch": 0.4570517629407352, "grad_norm": 0.8956592679023743, "learning_rate": 2e-05, "loss": 0.6843, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7311, "tokens_per_second_per_gpu": 10197.42, "total_tokens": 721960372 }, { "epoch": 0.45711427856964243, "grad_norm": 0.9011630415916443, "learning_rate": 2e-05, "loss": 0.645, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7312, "tokens_per_second_per_gpu": 10741.69, "total_tokens": 722060520 }, { "epoch": 0.4571767941985496, "grad_norm": 0.9239224195480347, "learning_rate": 2e-05, "loss": 0.6173, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7313, "tokens_per_second_per_gpu": 10203.43, "total_tokens": 722155534 }, { "epoch": 0.45723930982745686, "grad_norm": 0.8942326307296753, "learning_rate": 2e-05, "loss": 0.644, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7314, "tokens_per_second_per_gpu": 10496.35, "total_tokens": 722253445 }, { "epoch": 0.4573018254563641, "grad_norm": 0.9049120545387268, "learning_rate": 2e-05, "loss": 0.6332, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7315, "tokens_per_second_per_gpu": 10369.76, "total_tokens": 722351171 }, { "epoch": 0.4573643410852713, "grad_norm": 0.8899158835411072, "learning_rate": 2e-05, "loss": 0.6587, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7316, "tokens_per_second_per_gpu": 10402.49, "total_tokens": 722451722 }, { "epoch": 0.45742685671417854, "grad_norm": 0.8981859087944031, "learning_rate": 2e-05, "loss": 0.6405, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7317, "tokens_per_second_per_gpu": 10030.93, "total_tokens": 722549283 }, { "epoch": 0.4574893723430858, "grad_norm": 0.8865955471992493, "learning_rate": 2e-05, "loss": 0.6241, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7318, "tokens_per_second_per_gpu": 10310.38, "total_tokens": 722648092 }, { "epoch": 0.457551887971993, "grad_norm": 0.8846124410629272, "learning_rate": 2e-05, "loss": 0.6187, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7319, "tokens_per_second_per_gpu": 10273.35, "total_tokens": 722744785 }, { "epoch": 0.4576144036009002, "grad_norm": 0.8850002884864807, "learning_rate": 2e-05, "loss": 0.6343, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7320, "tokens_per_second_per_gpu": 10772.43, "total_tokens": 722844057 }, { "epoch": 0.45767691922980747, "grad_norm": 0.8812020421028137, "learning_rate": 2e-05, "loss": 0.6463, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7321, "tokens_per_second_per_gpu": 10124.02, "total_tokens": 722941658 }, { "epoch": 0.45773943485871466, "grad_norm": 0.9016830325126648, "learning_rate": 2e-05, "loss": 0.6325, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7322, "tokens_per_second_per_gpu": 10900.6, "total_tokens": 723039215 }, { "epoch": 0.4578019504876219, "grad_norm": 0.8650920391082764, "learning_rate": 2e-05, "loss": 0.6114, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7323, "tokens_per_second_per_gpu": 10181.84, "total_tokens": 723136756 }, { "epoch": 0.45786446611652915, "grad_norm": 0.8867003917694092, "learning_rate": 2e-05, "loss": 0.6512, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7324, "tokens_per_second_per_gpu": 10676.92, "total_tokens": 723235472 }, { "epoch": 0.45792698174543633, "grad_norm": 0.9006693363189697, "learning_rate": 2e-05, "loss": 0.6939, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7325, "tokens_per_second_per_gpu": 10372.43, "total_tokens": 723335170 }, { "epoch": 0.4579894973743436, "grad_norm": 0.8806000351905823, "learning_rate": 2e-05, "loss": 0.6508, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7326, "tokens_per_second_per_gpu": 11119.87, "total_tokens": 723437865 }, { "epoch": 0.4580520130032508, "grad_norm": 0.9380581974983215, "learning_rate": 2e-05, "loss": 0.6808, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7327, "tokens_per_second_per_gpu": 10084.5, "total_tokens": 723536125 }, { "epoch": 0.458114528632158, "grad_norm": 0.8892961740493774, "learning_rate": 2e-05, "loss": 0.6197, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7328, "tokens_per_second_per_gpu": 10040.37, "total_tokens": 723636149 }, { "epoch": 0.45817704426106526, "grad_norm": 0.8674980401992798, "learning_rate": 2e-05, "loss": 0.5937, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7329, "tokens_per_second_per_gpu": 9707.97, "total_tokens": 723731576 }, { "epoch": 0.4582395598899725, "grad_norm": 0.8664132952690125, "learning_rate": 2e-05, "loss": 0.6274, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7330, "tokens_per_second_per_gpu": 10329.74, "total_tokens": 723828122 }, { "epoch": 0.4583020755188797, "grad_norm": 0.9342005252838135, "learning_rate": 2e-05, "loss": 0.6906, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7331, "tokens_per_second_per_gpu": 10926.36, "total_tokens": 723927072 }, { "epoch": 0.45836459114778694, "grad_norm": 0.8825257420539856, "learning_rate": 2e-05, "loss": 0.6291, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7332, "tokens_per_second_per_gpu": 10474.79, "total_tokens": 724025229 }, { "epoch": 0.4584271067766942, "grad_norm": 0.8858168125152588, "learning_rate": 2e-05, "loss": 0.6465, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7333, "tokens_per_second_per_gpu": 10414.21, "total_tokens": 724125248 }, { "epoch": 0.4584896224056014, "grad_norm": 0.9049124717712402, "learning_rate": 2e-05, "loss": 0.6484, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7334, "tokens_per_second_per_gpu": 10947.44, "total_tokens": 724224977 }, { "epoch": 0.4585521380345086, "grad_norm": 0.902786135673523, "learning_rate": 2e-05, "loss": 0.6177, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7335, "tokens_per_second_per_gpu": 10279.37, "total_tokens": 724319906 }, { "epoch": 0.45861465366341586, "grad_norm": 0.9045829772949219, "learning_rate": 2e-05, "loss": 0.6253, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7336, "tokens_per_second_per_gpu": 9901.55, "total_tokens": 724416056 }, { "epoch": 0.4586771692923231, "grad_norm": 0.8919491767883301, "learning_rate": 2e-05, "loss": 0.6056, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7337, "tokens_per_second_per_gpu": 10167.14, "total_tokens": 724511756 }, { "epoch": 0.4587396849212303, "grad_norm": 0.8839619755744934, "learning_rate": 2e-05, "loss": 0.6308, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7338, "tokens_per_second_per_gpu": 11219.66, "total_tokens": 724611438 }, { "epoch": 0.45880220055013754, "grad_norm": 0.8734222054481506, "learning_rate": 2e-05, "loss": 0.6679, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7339, "tokens_per_second_per_gpu": 10762.81, "total_tokens": 724713823 }, { "epoch": 0.4588647161790448, "grad_norm": 0.9263575673103333, "learning_rate": 2e-05, "loss": 0.6441, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7340, "tokens_per_second_per_gpu": 11029.32, "total_tokens": 724814523 }, { "epoch": 0.458927231807952, "grad_norm": 0.9424033761024475, "learning_rate": 2e-05, "loss": 0.6411, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7341, "tokens_per_second_per_gpu": 10213.58, "total_tokens": 724913003 }, { "epoch": 0.4589897474368592, "grad_norm": 0.9828806519508362, "learning_rate": 2e-05, "loss": 0.6439, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7342, "tokens_per_second_per_gpu": 10375.34, "total_tokens": 725011581 }, { "epoch": 0.45905226306576646, "grad_norm": 0.9346844553947449, "learning_rate": 2e-05, "loss": 0.6465, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7343, "tokens_per_second_per_gpu": 9752.45, "total_tokens": 725106221 }, { "epoch": 0.45911477869467365, "grad_norm": 0.9615757465362549, "learning_rate": 2e-05, "loss": 0.6507, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7344, "tokens_per_second_per_gpu": 9645.35, "total_tokens": 725203711 }, { "epoch": 0.4591772943235809, "grad_norm": 0.934457004070282, "learning_rate": 2e-05, "loss": 0.6278, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7345, "tokens_per_second_per_gpu": 10347.07, "total_tokens": 725299844 }, { "epoch": 0.45923980995248814, "grad_norm": 0.8412603735923767, "learning_rate": 2e-05, "loss": 0.5996, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7346, "tokens_per_second_per_gpu": 10350.71, "total_tokens": 725397522 }, { "epoch": 0.45930232558139533, "grad_norm": 0.9240483641624451, "learning_rate": 2e-05, "loss": 0.6641, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7347, "tokens_per_second_per_gpu": 10627.62, "total_tokens": 725497672 }, { "epoch": 0.4593648412103026, "grad_norm": 0.8787997364997864, "learning_rate": 2e-05, "loss": 0.6165, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7348, "tokens_per_second_per_gpu": 10750.75, "total_tokens": 725599939 }, { "epoch": 0.4594273568392098, "grad_norm": 0.8888474702835083, "learning_rate": 2e-05, "loss": 0.6392, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7349, "tokens_per_second_per_gpu": 9654.13, "total_tokens": 725695632 }, { "epoch": 0.459489872468117, "grad_norm": 0.8950501680374146, "learning_rate": 2e-05, "loss": 0.6147, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7350, "tokens_per_second_per_gpu": 10939.6, "total_tokens": 725794349 }, { "epoch": 0.45955238809702426, "grad_norm": 0.9038364291191101, "learning_rate": 2e-05, "loss": 0.6731, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7351, "tokens_per_second_per_gpu": 10425.19, "total_tokens": 725892322 }, { "epoch": 0.4596149037259315, "grad_norm": 0.9263175129890442, "learning_rate": 2e-05, "loss": 0.6619, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7352, "tokens_per_second_per_gpu": 10041.14, "total_tokens": 725988749 }, { "epoch": 0.4596774193548387, "grad_norm": 0.8842488527297974, "learning_rate": 2e-05, "loss": 0.6747, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7353, "tokens_per_second_per_gpu": 10381.5, "total_tokens": 726089472 }, { "epoch": 0.45973993498374593, "grad_norm": 0.8539533615112305, "learning_rate": 2e-05, "loss": 0.6005, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7354, "tokens_per_second_per_gpu": 10612.08, "total_tokens": 726187406 }, { "epoch": 0.4598024506126532, "grad_norm": 0.936436116695404, "learning_rate": 2e-05, "loss": 0.6732, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7355, "tokens_per_second_per_gpu": 10110.88, "total_tokens": 726283758 }, { "epoch": 0.45986496624156037, "grad_norm": 0.8807400465011597, "learning_rate": 2e-05, "loss": 0.6723, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7356, "tokens_per_second_per_gpu": 11460.45, "total_tokens": 726386382 }, { "epoch": 0.4599274818704676, "grad_norm": 0.9013773202896118, "learning_rate": 2e-05, "loss": 0.6661, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7357, "tokens_per_second_per_gpu": 11364.07, "total_tokens": 726486819 }, { "epoch": 0.45998999749937486, "grad_norm": 0.9374392628669739, "learning_rate": 2e-05, "loss": 0.6692, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7358, "tokens_per_second_per_gpu": 9887.98, "total_tokens": 726583859 }, { "epoch": 0.46005251312828205, "grad_norm": 1.0055817365646362, "learning_rate": 2e-05, "loss": 0.7007, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7359, "tokens_per_second_per_gpu": 10185.96, "total_tokens": 726684359 }, { "epoch": 0.4601150287571893, "grad_norm": 0.8493905067443848, "learning_rate": 2e-05, "loss": 0.6311, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7360, "tokens_per_second_per_gpu": 10142.06, "total_tokens": 726784355 }, { "epoch": 0.46017754438609654, "grad_norm": 0.8579049706459045, "learning_rate": 2e-05, "loss": 0.5925, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7361, "tokens_per_second_per_gpu": 9568.57, "total_tokens": 726879884 }, { "epoch": 0.4602400600150037, "grad_norm": 0.8359580636024475, "learning_rate": 2e-05, "loss": 0.617, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7362, "tokens_per_second_per_gpu": 10691.95, "total_tokens": 726979776 }, { "epoch": 0.46030257564391097, "grad_norm": 0.8841294050216675, "learning_rate": 2e-05, "loss": 0.668, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7363, "tokens_per_second_per_gpu": 11253.33, "total_tokens": 727079436 }, { "epoch": 0.4603650912728182, "grad_norm": 0.9193941950798035, "learning_rate": 2e-05, "loss": 0.6265, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7364, "tokens_per_second_per_gpu": 12484.33, "total_tokens": 727175164 }, { "epoch": 0.4604276069017254, "grad_norm": 0.8436521291732788, "learning_rate": 2e-05, "loss": 0.6175, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7365, "tokens_per_second_per_gpu": 12433.48, "total_tokens": 727276618 }, { "epoch": 0.46049012253063265, "grad_norm": 0.9023586511611938, "learning_rate": 2e-05, "loss": 0.6404, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7366, "tokens_per_second_per_gpu": 10421.53, "total_tokens": 727375017 }, { "epoch": 0.4605526381595399, "grad_norm": 0.8987757563591003, "learning_rate": 2e-05, "loss": 0.6627, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7367, "tokens_per_second_per_gpu": 10637.11, "total_tokens": 727474436 }, { "epoch": 0.4606151537884471, "grad_norm": 0.8742930293083191, "learning_rate": 2e-05, "loss": 0.6417, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7368, "tokens_per_second_per_gpu": 9794.39, "total_tokens": 727569173 }, { "epoch": 0.46067766941735433, "grad_norm": 0.8581012487411499, "learning_rate": 2e-05, "loss": 0.6357, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7369, "tokens_per_second_per_gpu": 10857.28, "total_tokens": 727668915 }, { "epoch": 0.4607401850462616, "grad_norm": 0.8872167468070984, "learning_rate": 2e-05, "loss": 0.6575, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7370, "tokens_per_second_per_gpu": 10019.52, "total_tokens": 727767721 }, { "epoch": 0.4608027006751688, "grad_norm": 0.8731235861778259, "learning_rate": 2e-05, "loss": 0.6579, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7371, "tokens_per_second_per_gpu": 10925.11, "total_tokens": 727869961 }, { "epoch": 0.460865216304076, "grad_norm": 0.8855920433998108, "learning_rate": 2e-05, "loss": 0.6707, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7372, "tokens_per_second_per_gpu": 10849.62, "total_tokens": 727973600 }, { "epoch": 0.46092773193298325, "grad_norm": 0.8852561116218567, "learning_rate": 2e-05, "loss": 0.6225, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7373, "tokens_per_second_per_gpu": 10944.36, "total_tokens": 728071682 }, { "epoch": 0.4609902475618905, "grad_norm": 0.8631627559661865, "learning_rate": 2e-05, "loss": 0.6311, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7374, "tokens_per_second_per_gpu": 10683.46, "total_tokens": 728171883 }, { "epoch": 0.4610527631907977, "grad_norm": 0.9051917791366577, "learning_rate": 2e-05, "loss": 0.6241, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7375, "tokens_per_second_per_gpu": 9450.67, "total_tokens": 728266267 }, { "epoch": 0.46111527881970493, "grad_norm": 0.9415287971496582, "learning_rate": 2e-05, "loss": 0.7172, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7376, "tokens_per_second_per_gpu": 11244.23, "total_tokens": 728370046 }, { "epoch": 0.4611777944486122, "grad_norm": 0.857174277305603, "learning_rate": 2e-05, "loss": 0.6323, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7377, "tokens_per_second_per_gpu": 10877.38, "total_tokens": 728469719 }, { "epoch": 0.46124031007751937, "grad_norm": 0.8764782547950745, "learning_rate": 2e-05, "loss": 0.6592, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7378, "tokens_per_second_per_gpu": 10227.94, "total_tokens": 728567688 }, { "epoch": 0.4613028257064266, "grad_norm": 0.9191486835479736, "learning_rate": 2e-05, "loss": 0.6904, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7379, "tokens_per_second_per_gpu": 10999.44, "total_tokens": 728666064 }, { "epoch": 0.46136534133533386, "grad_norm": 0.8602408170700073, "learning_rate": 2e-05, "loss": 0.6368, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7380, "tokens_per_second_per_gpu": 11014.58, "total_tokens": 728766527 }, { "epoch": 0.46142785696424105, "grad_norm": 0.895830512046814, "learning_rate": 2e-05, "loss": 0.6187, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7381, "tokens_per_second_per_gpu": 9558.32, "total_tokens": 728862389 }, { "epoch": 0.4614903725931483, "grad_norm": 0.8864951729774475, "learning_rate": 2e-05, "loss": 0.6365, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7382, "tokens_per_second_per_gpu": 10107.56, "total_tokens": 728961551 }, { "epoch": 0.46155288822205554, "grad_norm": 0.9175781607627869, "learning_rate": 2e-05, "loss": 0.6416, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7383, "tokens_per_second_per_gpu": 10263.25, "total_tokens": 729057007 }, { "epoch": 0.4616154038509627, "grad_norm": 0.9749646782875061, "learning_rate": 2e-05, "loss": 0.6126, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7384, "tokens_per_second_per_gpu": 9917.52, "total_tokens": 729152368 }, { "epoch": 0.46167791947986997, "grad_norm": 0.876726508140564, "learning_rate": 2e-05, "loss": 0.6341, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7385, "tokens_per_second_per_gpu": 10424.14, "total_tokens": 729251652 }, { "epoch": 0.4617404351087772, "grad_norm": 0.9205405712127686, "learning_rate": 2e-05, "loss": 0.6287, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7386, "tokens_per_second_per_gpu": 9922.3, "total_tokens": 729341095 }, { "epoch": 0.4618029507376844, "grad_norm": 0.923498272895813, "learning_rate": 2e-05, "loss": 0.6476, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7387, "tokens_per_second_per_gpu": 10515.88, "total_tokens": 729439398 }, { "epoch": 0.46186546636659165, "grad_norm": 0.9368107914924622, "learning_rate": 2e-05, "loss": 0.6268, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7388, "tokens_per_second_per_gpu": 10634.79, "total_tokens": 729535956 }, { "epoch": 0.4619279819954989, "grad_norm": 0.866631269454956, "learning_rate": 2e-05, "loss": 0.6395, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7389, "tokens_per_second_per_gpu": 10779.79, "total_tokens": 729635969 }, { "epoch": 0.4619904976244061, "grad_norm": 0.9041223526000977, "learning_rate": 2e-05, "loss": 0.6619, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7390, "tokens_per_second_per_gpu": 10973.41, "total_tokens": 729735533 }, { "epoch": 0.4620530132533133, "grad_norm": 0.8747127056121826, "learning_rate": 2e-05, "loss": 0.6519, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7391, "tokens_per_second_per_gpu": 10859.82, "total_tokens": 729839051 }, { "epoch": 0.46211552888222057, "grad_norm": 0.9192379117012024, "learning_rate": 2e-05, "loss": 0.6902, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7392, "tokens_per_second_per_gpu": 10520.57, "total_tokens": 729938519 }, { "epoch": 0.46217804451112776, "grad_norm": 0.8824442028999329, "learning_rate": 2e-05, "loss": 0.67, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7393, "tokens_per_second_per_gpu": 10604.38, "total_tokens": 730037043 }, { "epoch": 0.462240560140035, "grad_norm": 0.9230514764785767, "learning_rate": 2e-05, "loss": 0.6585, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7394, "tokens_per_second_per_gpu": 10605.07, "total_tokens": 730133895 }, { "epoch": 0.46230307576894225, "grad_norm": 0.8913416862487793, "learning_rate": 2e-05, "loss": 0.643, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7395, "tokens_per_second_per_gpu": 10550.05, "total_tokens": 730234532 }, { "epoch": 0.46236559139784944, "grad_norm": 0.9305736422538757, "learning_rate": 2e-05, "loss": 0.7122, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7396, "tokens_per_second_per_gpu": 10593.93, "total_tokens": 730333873 }, { "epoch": 0.4624281070267567, "grad_norm": 0.9242508411407471, "learning_rate": 2e-05, "loss": 0.6542, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7397, "tokens_per_second_per_gpu": 9803.21, "total_tokens": 730427544 }, { "epoch": 0.46249062265566393, "grad_norm": 0.8440622091293335, "learning_rate": 2e-05, "loss": 0.6276, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7398, "tokens_per_second_per_gpu": 9082.54, "total_tokens": 730525166 }, { "epoch": 0.4625531382845711, "grad_norm": 0.8677139282226562, "learning_rate": 2e-05, "loss": 0.6125, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7399, "tokens_per_second_per_gpu": 10029.72, "total_tokens": 730620644 }, { "epoch": 0.46261565391347836, "grad_norm": 0.8736371397972107, "learning_rate": 2e-05, "loss": 0.6543, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7400, "tokens_per_second_per_gpu": 10372.32, "total_tokens": 730721727 }, { "epoch": 0.4626781695423856, "grad_norm": 0.8612183928489685, "learning_rate": 2e-05, "loss": 0.6359, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7401, "tokens_per_second_per_gpu": 10348.12, "total_tokens": 730822538 }, { "epoch": 0.4627406851712928, "grad_norm": 0.9024666547775269, "learning_rate": 2e-05, "loss": 0.6603, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7402, "tokens_per_second_per_gpu": 11037.95, "total_tokens": 730925396 }, { "epoch": 0.46280320080020004, "grad_norm": 0.8920968174934387, "learning_rate": 2e-05, "loss": 0.6366, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7403, "tokens_per_second_per_gpu": 11024.19, "total_tokens": 731023311 }, { "epoch": 0.4628657164291073, "grad_norm": 0.925525426864624, "learning_rate": 2e-05, "loss": 0.6643, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7404, "tokens_per_second_per_gpu": 10119.67, "total_tokens": 731117028 }, { "epoch": 0.4629282320580145, "grad_norm": 0.9003754258155823, "learning_rate": 2e-05, "loss": 0.6455, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7405, "tokens_per_second_per_gpu": 10740.48, "total_tokens": 731215581 }, { "epoch": 0.4629907476869217, "grad_norm": 0.9275972247123718, "learning_rate": 2e-05, "loss": 0.6558, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7406, "tokens_per_second_per_gpu": 10588.03, "total_tokens": 731319067 }, { "epoch": 0.46305326331582897, "grad_norm": 0.8867403864860535, "learning_rate": 2e-05, "loss": 0.6054, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7407, "tokens_per_second_per_gpu": 10595.48, "total_tokens": 731419243 }, { "epoch": 0.4631157789447362, "grad_norm": 0.884556770324707, "learning_rate": 2e-05, "loss": 0.6344, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7408, "tokens_per_second_per_gpu": 10683.35, "total_tokens": 731517999 }, { "epoch": 0.4631782945736434, "grad_norm": 0.9328817129135132, "learning_rate": 2e-05, "loss": 0.6165, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7409, "tokens_per_second_per_gpu": 10117.49, "total_tokens": 731616124 }, { "epoch": 0.46324081020255065, "grad_norm": 0.8875576853752136, "learning_rate": 2e-05, "loss": 0.6307, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7410, "tokens_per_second_per_gpu": 10496.69, "total_tokens": 731713411 }, { "epoch": 0.4633033258314579, "grad_norm": 0.8802395462989807, "learning_rate": 2e-05, "loss": 0.6251, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7411, "tokens_per_second_per_gpu": 10637.26, "total_tokens": 731816092 }, { "epoch": 0.4633658414603651, "grad_norm": 0.8775145411491394, "learning_rate": 2e-05, "loss": 0.6666, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7412, "tokens_per_second_per_gpu": 11179.44, "total_tokens": 731916004 }, { "epoch": 0.4634283570892723, "grad_norm": 0.9061028361320496, "learning_rate": 2e-05, "loss": 0.6737, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7413, "tokens_per_second_per_gpu": 10480.87, "total_tokens": 732013059 }, { "epoch": 0.46349087271817957, "grad_norm": 0.913101315498352, "learning_rate": 2e-05, "loss": 0.6105, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7414, "tokens_per_second_per_gpu": 10903.9, "total_tokens": 732110075 }, { "epoch": 0.46355338834708676, "grad_norm": 0.9162770509719849, "learning_rate": 2e-05, "loss": 0.6719, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7415, "tokens_per_second_per_gpu": 10375.77, "total_tokens": 732209273 }, { "epoch": 0.463615903975994, "grad_norm": 0.8764663934707642, "learning_rate": 2e-05, "loss": 0.6087, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7416, "tokens_per_second_per_gpu": 11084.77, "total_tokens": 732309580 }, { "epoch": 0.46367841960490125, "grad_norm": 0.8532830476760864, "learning_rate": 2e-05, "loss": 0.6631, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7417, "tokens_per_second_per_gpu": 10517.23, "total_tokens": 732408972 }, { "epoch": 0.46374093523380844, "grad_norm": 0.8754269480705261, "learning_rate": 2e-05, "loss": 0.6665, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7418, "tokens_per_second_per_gpu": 10818.11, "total_tokens": 732510449 }, { "epoch": 0.4638034508627157, "grad_norm": 0.9149705767631531, "learning_rate": 2e-05, "loss": 0.6616, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7419, "tokens_per_second_per_gpu": 11060.92, "total_tokens": 732611392 }, { "epoch": 0.4638659664916229, "grad_norm": 0.9306687116622925, "learning_rate": 2e-05, "loss": 0.6584, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7420, "tokens_per_second_per_gpu": 10983.08, "total_tokens": 732710496 }, { "epoch": 0.4639284821205301, "grad_norm": 0.9270505309104919, "learning_rate": 2e-05, "loss": 0.6253, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7421, "tokens_per_second_per_gpu": 10182.12, "total_tokens": 732806951 }, { "epoch": 0.46399099774943736, "grad_norm": 0.9513511061668396, "learning_rate": 2e-05, "loss": 0.6308, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7422, "tokens_per_second_per_gpu": 10948.5, "total_tokens": 732902266 }, { "epoch": 0.4640535133783446, "grad_norm": 0.8673477172851562, "learning_rate": 2e-05, "loss": 0.6245, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7423, "tokens_per_second_per_gpu": 10872.38, "total_tokens": 733003189 }, { "epoch": 0.4641160290072518, "grad_norm": 0.8815592527389526, "learning_rate": 2e-05, "loss": 0.6313, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7424, "tokens_per_second_per_gpu": 9760.57, "total_tokens": 733102715 }, { "epoch": 0.46417854463615904, "grad_norm": 0.9070740342140198, "learning_rate": 2e-05, "loss": 0.6606, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7425, "tokens_per_second_per_gpu": 11128.46, "total_tokens": 733202561 }, { "epoch": 0.4642410602650663, "grad_norm": 0.8993461728096008, "learning_rate": 2e-05, "loss": 0.6448, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7426, "tokens_per_second_per_gpu": 10636.24, "total_tokens": 733299631 }, { "epoch": 0.4643035758939735, "grad_norm": 0.9155744910240173, "learning_rate": 2e-05, "loss": 0.6518, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7427, "tokens_per_second_per_gpu": 10247.14, "total_tokens": 733394382 }, { "epoch": 0.4643660915228807, "grad_norm": 0.8945683240890503, "learning_rate": 2e-05, "loss": 0.6328, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7428, "tokens_per_second_per_gpu": 10755.29, "total_tokens": 733494592 }, { "epoch": 0.46442860715178796, "grad_norm": 0.9104045629501343, "learning_rate": 2e-05, "loss": 0.6486, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7429, "tokens_per_second_per_gpu": 10381.08, "total_tokens": 733593665 }, { "epoch": 0.46449112278069515, "grad_norm": 0.9489929676055908, "learning_rate": 2e-05, "loss": 0.6348, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7430, "tokens_per_second_per_gpu": 10261.13, "total_tokens": 733690645 }, { "epoch": 0.4645536384096024, "grad_norm": 0.9080439805984497, "learning_rate": 2e-05, "loss": 0.6178, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7431, "tokens_per_second_per_gpu": 9786.24, "total_tokens": 733786307 }, { "epoch": 0.46461615403850964, "grad_norm": 0.932508111000061, "learning_rate": 2e-05, "loss": 0.6743, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7432, "tokens_per_second_per_gpu": 9970.16, "total_tokens": 733883324 }, { "epoch": 0.46467866966741683, "grad_norm": 0.8962225914001465, "learning_rate": 2e-05, "loss": 0.6427, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7433, "tokens_per_second_per_gpu": 10722.05, "total_tokens": 733984691 }, { "epoch": 0.4647411852963241, "grad_norm": 0.9305459856987, "learning_rate": 2e-05, "loss": 0.6451, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7434, "tokens_per_second_per_gpu": 10019.84, "total_tokens": 734081707 }, { "epoch": 0.4648037009252313, "grad_norm": 0.9526340365409851, "learning_rate": 2e-05, "loss": 0.6336, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7435, "tokens_per_second_per_gpu": 10139.85, "total_tokens": 734176542 }, { "epoch": 0.4648662165541385, "grad_norm": 0.9333956241607666, "learning_rate": 2e-05, "loss": 0.6236, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7436, "tokens_per_second_per_gpu": 9728.18, "total_tokens": 734270270 }, { "epoch": 0.46492873218304576, "grad_norm": 0.8777334690093994, "learning_rate": 2e-05, "loss": 0.6033, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7437, "tokens_per_second_per_gpu": 10327.64, "total_tokens": 734367906 }, { "epoch": 0.464991247811953, "grad_norm": 0.9386419653892517, "learning_rate": 2e-05, "loss": 0.6169, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7438, "tokens_per_second_per_gpu": 10437.43, "total_tokens": 734463699 }, { "epoch": 0.4650537634408602, "grad_norm": 0.9030951261520386, "learning_rate": 2e-05, "loss": 0.6785, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7439, "tokens_per_second_per_gpu": 10881.84, "total_tokens": 734564439 }, { "epoch": 0.46511627906976744, "grad_norm": 0.8882812261581421, "learning_rate": 2e-05, "loss": 0.6074, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7440, "tokens_per_second_per_gpu": 10296.71, "total_tokens": 734660430 }, { "epoch": 0.4651787946986747, "grad_norm": 0.8970914483070374, "learning_rate": 2e-05, "loss": 0.6514, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7441, "tokens_per_second_per_gpu": 10405.38, "total_tokens": 734763316 }, { "epoch": 0.46524131032758187, "grad_norm": 0.8682104349136353, "learning_rate": 2e-05, "loss": 0.6299, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7442, "tokens_per_second_per_gpu": 10678.38, "total_tokens": 734862768 }, { "epoch": 0.4653038259564891, "grad_norm": 0.877457320690155, "learning_rate": 2e-05, "loss": 0.6583, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7443, "tokens_per_second_per_gpu": 10172.17, "total_tokens": 734959835 }, { "epoch": 0.46536634158539636, "grad_norm": 0.8864495754241943, "learning_rate": 2e-05, "loss": 0.6389, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7444, "tokens_per_second_per_gpu": 11123.09, "total_tokens": 735061101 }, { "epoch": 0.46542885721430355, "grad_norm": 0.8855008482933044, "learning_rate": 2e-05, "loss": 0.6257, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7445, "tokens_per_second_per_gpu": 10611.96, "total_tokens": 735158655 }, { "epoch": 0.4654913728432108, "grad_norm": 0.9442863464355469, "learning_rate": 2e-05, "loss": 0.6847, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7446, "tokens_per_second_per_gpu": 10700.78, "total_tokens": 735255965 }, { "epoch": 0.46555388847211804, "grad_norm": 0.8843913674354553, "learning_rate": 2e-05, "loss": 0.6327, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7447, "tokens_per_second_per_gpu": 11175.43, "total_tokens": 735351734 }, { "epoch": 0.4656164041010253, "grad_norm": 0.8876384496688843, "learning_rate": 2e-05, "loss": 0.6263, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7448, "tokens_per_second_per_gpu": 10554.46, "total_tokens": 735452568 }, { "epoch": 0.4656789197299325, "grad_norm": 0.9458191990852356, "learning_rate": 2e-05, "loss": 0.6918, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7449, "tokens_per_second_per_gpu": 10038.82, "total_tokens": 735548969 }, { "epoch": 0.4657414353588397, "grad_norm": 0.8813024163246155, "learning_rate": 2e-05, "loss": 0.6311, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7450, "tokens_per_second_per_gpu": 11134.25, "total_tokens": 735644706 }, { "epoch": 0.46580395098774696, "grad_norm": 0.8980368375778198, "learning_rate": 2e-05, "loss": 0.6332, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7451, "tokens_per_second_per_gpu": 10980.28, "total_tokens": 735746869 }, { "epoch": 0.46586646661665415, "grad_norm": 0.8920280933380127, "learning_rate": 2e-05, "loss": 0.6492, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7452, "tokens_per_second_per_gpu": 11053.81, "total_tokens": 735844092 }, { "epoch": 0.4659289822455614, "grad_norm": 0.9376959204673767, "learning_rate": 2e-05, "loss": 0.6347, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7453, "tokens_per_second_per_gpu": 9746.5, "total_tokens": 735937554 }, { "epoch": 0.46599149787446864, "grad_norm": 0.8598536849021912, "learning_rate": 2e-05, "loss": 0.6605, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7454, "tokens_per_second_per_gpu": 10520.07, "total_tokens": 736036510 }, { "epoch": 0.46605401350337583, "grad_norm": 0.9002487659454346, "learning_rate": 2e-05, "loss": 0.6525, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7455, "tokens_per_second_per_gpu": 10717.67, "total_tokens": 736133058 }, { "epoch": 0.4661165291322831, "grad_norm": 0.9106837511062622, "learning_rate": 2e-05, "loss": 0.6519, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7456, "tokens_per_second_per_gpu": 10626.21, "total_tokens": 736238317 }, { "epoch": 0.4661790447611903, "grad_norm": 0.899539589881897, "learning_rate": 2e-05, "loss": 0.6863, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7457, "tokens_per_second_per_gpu": 9900.83, "total_tokens": 736339382 }, { "epoch": 0.4662415603900975, "grad_norm": 0.8732595443725586, "learning_rate": 2e-05, "loss": 0.6017, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7458, "tokens_per_second_per_gpu": 9346.22, "total_tokens": 736432344 }, { "epoch": 0.46630407601900475, "grad_norm": 0.906438946723938, "learning_rate": 2e-05, "loss": 0.7165, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7459, "tokens_per_second_per_gpu": 10911.48, "total_tokens": 736533480 }, { "epoch": 0.466366591647912, "grad_norm": 0.9008656740188599, "learning_rate": 2e-05, "loss": 0.6397, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7460, "tokens_per_second_per_gpu": 11026.08, "total_tokens": 736635384 }, { "epoch": 0.4664291072768192, "grad_norm": 0.9295186996459961, "learning_rate": 2e-05, "loss": 0.6823, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7461, "tokens_per_second_per_gpu": 10687.28, "total_tokens": 736735537 }, { "epoch": 0.46649162290572643, "grad_norm": 0.889243483543396, "learning_rate": 2e-05, "loss": 0.6637, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7462, "tokens_per_second_per_gpu": 10881.67, "total_tokens": 736833579 }, { "epoch": 0.4665541385346337, "grad_norm": 0.8660889267921448, "learning_rate": 2e-05, "loss": 0.6376, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7463, "tokens_per_second_per_gpu": 10642.67, "total_tokens": 736935761 }, { "epoch": 0.46661665416354087, "grad_norm": 0.884080171585083, "learning_rate": 2e-05, "loss": 0.6655, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7464, "tokens_per_second_per_gpu": 10812.56, "total_tokens": 737037031 }, { "epoch": 0.4666791697924481, "grad_norm": 0.8934435248374939, "learning_rate": 2e-05, "loss": 0.6562, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7465, "tokens_per_second_per_gpu": 10955.9, "total_tokens": 737137527 }, { "epoch": 0.46674168542135536, "grad_norm": 0.877741277217865, "learning_rate": 2e-05, "loss": 0.6354, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7466, "tokens_per_second_per_gpu": 10695.82, "total_tokens": 737232161 }, { "epoch": 0.46680420105026255, "grad_norm": 0.8844292759895325, "learning_rate": 2e-05, "loss": 0.5934, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7467, "tokens_per_second_per_gpu": 10358.74, "total_tokens": 737325418 }, { "epoch": 0.4668667166791698, "grad_norm": 0.8633629679679871, "learning_rate": 2e-05, "loss": 0.6308, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7468, "tokens_per_second_per_gpu": 10740.66, "total_tokens": 737423607 }, { "epoch": 0.46692923230807704, "grad_norm": 0.8718348145484924, "learning_rate": 2e-05, "loss": 0.6924, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7469, "tokens_per_second_per_gpu": 11057.27, "total_tokens": 737523096 }, { "epoch": 0.4669917479369842, "grad_norm": 0.8410258889198303, "learning_rate": 2e-05, "loss": 0.6406, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7470, "tokens_per_second_per_gpu": 11426.78, "total_tokens": 737624829 }, { "epoch": 0.46705426356589147, "grad_norm": 0.8842764496803284, "learning_rate": 2e-05, "loss": 0.6463, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7471, "tokens_per_second_per_gpu": 11210.71, "total_tokens": 737728167 }, { "epoch": 0.4671167791947987, "grad_norm": 0.8637673854827881, "learning_rate": 2e-05, "loss": 0.6511, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7472, "tokens_per_second_per_gpu": 10414.06, "total_tokens": 737828731 }, { "epoch": 0.4671792948237059, "grad_norm": 0.8894246816635132, "learning_rate": 2e-05, "loss": 0.6453, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7473, "tokens_per_second_per_gpu": 10368.81, "total_tokens": 737925526 }, { "epoch": 0.46724181045261315, "grad_norm": 0.8906306028366089, "learning_rate": 2e-05, "loss": 0.6225, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7474, "tokens_per_second_per_gpu": 10973.05, "total_tokens": 738026809 }, { "epoch": 0.4673043260815204, "grad_norm": 0.8809357285499573, "learning_rate": 2e-05, "loss": 0.6248, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7475, "tokens_per_second_per_gpu": 10511.17, "total_tokens": 738127962 }, { "epoch": 0.4673668417104276, "grad_norm": 0.8529887199401855, "learning_rate": 2e-05, "loss": 0.605, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7476, "tokens_per_second_per_gpu": 9334.18, "total_tokens": 738221900 }, { "epoch": 0.46742935733933483, "grad_norm": 0.9063286185264587, "learning_rate": 2e-05, "loss": 0.6427, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7477, "tokens_per_second_per_gpu": 10621.98, "total_tokens": 738319685 }, { "epoch": 0.4674918729682421, "grad_norm": 0.9065563678741455, "learning_rate": 2e-05, "loss": 0.6128, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7478, "tokens_per_second_per_gpu": 9754.96, "total_tokens": 738414906 }, { "epoch": 0.46755438859714926, "grad_norm": 0.8416316509246826, "learning_rate": 2e-05, "loss": 0.6329, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7479, "tokens_per_second_per_gpu": 11053.88, "total_tokens": 738515125 }, { "epoch": 0.4676169042260565, "grad_norm": 0.8953467011451721, "learning_rate": 2e-05, "loss": 0.597, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7480, "tokens_per_second_per_gpu": 9780.94, "total_tokens": 738608474 }, { "epoch": 0.46767941985496375, "grad_norm": 0.867882251739502, "learning_rate": 2e-05, "loss": 0.6066, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7481, "tokens_per_second_per_gpu": 10399.44, "total_tokens": 738707109 }, { "epoch": 0.46774193548387094, "grad_norm": 0.878134548664093, "learning_rate": 2e-05, "loss": 0.6153, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7482, "tokens_per_second_per_gpu": 10036.84, "total_tokens": 738803995 }, { "epoch": 0.4678044511127782, "grad_norm": 0.8784801363945007, "learning_rate": 2e-05, "loss": 0.6521, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7483, "tokens_per_second_per_gpu": 9896.57, "total_tokens": 738904772 }, { "epoch": 0.46786696674168543, "grad_norm": 0.8952184319496155, "learning_rate": 2e-05, "loss": 0.592, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7484, "tokens_per_second_per_gpu": 9950.41, "total_tokens": 738997944 }, { "epoch": 0.4679294823705927, "grad_norm": 0.866499662399292, "learning_rate": 2e-05, "loss": 0.6442, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7485, "tokens_per_second_per_gpu": 10837.33, "total_tokens": 739099641 }, { "epoch": 0.46799199799949986, "grad_norm": 0.9553094506263733, "learning_rate": 2e-05, "loss": 0.647, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7486, "tokens_per_second_per_gpu": 11310.44, "total_tokens": 739196477 }, { "epoch": 0.4680545136284071, "grad_norm": 0.9085789322853088, "learning_rate": 2e-05, "loss": 0.6527, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7487, "tokens_per_second_per_gpu": 10475.71, "total_tokens": 739297069 }, { "epoch": 0.46811702925731435, "grad_norm": 0.8990103006362915, "learning_rate": 2e-05, "loss": 0.6002, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7488, "tokens_per_second_per_gpu": 10102.02, "total_tokens": 739392031 }, { "epoch": 0.46817954488622154, "grad_norm": 0.9015179872512817, "learning_rate": 2e-05, "loss": 0.6641, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7489, "tokens_per_second_per_gpu": 11083.7, "total_tokens": 739492108 }, { "epoch": 0.4682420605151288, "grad_norm": 0.914183497428894, "learning_rate": 2e-05, "loss": 0.6495, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7490, "tokens_per_second_per_gpu": 9889.24, "total_tokens": 739588766 }, { "epoch": 0.46830457614403603, "grad_norm": 0.8726728558540344, "learning_rate": 2e-05, "loss": 0.6354, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7491, "tokens_per_second_per_gpu": 10531.8, "total_tokens": 739689177 }, { "epoch": 0.4683670917729432, "grad_norm": 0.9183850288391113, "learning_rate": 2e-05, "loss": 0.6379, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7492, "tokens_per_second_per_gpu": 10220.31, "total_tokens": 739784503 }, { "epoch": 0.46842960740185047, "grad_norm": 0.8737242817878723, "learning_rate": 2e-05, "loss": 0.6398, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7493, "tokens_per_second_per_gpu": 10092.26, "total_tokens": 739882719 }, { "epoch": 0.4684921230307577, "grad_norm": 0.8683391809463501, "learning_rate": 2e-05, "loss": 0.6402, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7494, "tokens_per_second_per_gpu": 10886.47, "total_tokens": 739979328 }, { "epoch": 0.4685546386596649, "grad_norm": 0.915401816368103, "learning_rate": 2e-05, "loss": 0.6601, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7495, "tokens_per_second_per_gpu": 11442.24, "total_tokens": 740082508 }, { "epoch": 0.46861715428857215, "grad_norm": 0.8545110821723938, "learning_rate": 2e-05, "loss": 0.614, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7496, "tokens_per_second_per_gpu": 10596.77, "total_tokens": 740181226 }, { "epoch": 0.4686796699174794, "grad_norm": 0.8945276737213135, "learning_rate": 2e-05, "loss": 0.6858, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7497, "tokens_per_second_per_gpu": 10423.0, "total_tokens": 740281396 }, { "epoch": 0.4687421855463866, "grad_norm": 0.8845691084861755, "learning_rate": 2e-05, "loss": 0.6675, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7498, "tokens_per_second_per_gpu": 10047.19, "total_tokens": 740379397 }, { "epoch": 0.4688047011752938, "grad_norm": 0.8682519793510437, "learning_rate": 2e-05, "loss": 0.6561, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7499, "tokens_per_second_per_gpu": 10582.08, "total_tokens": 740480910 }, { "epoch": 0.46886721680420107, "grad_norm": 0.8945838809013367, "learning_rate": 2e-05, "loss": 0.6402, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7500, "tokens_per_second_per_gpu": 11409.35, "total_tokens": 740580961 }, { "epoch": 0.46892973243310826, "grad_norm": 0.9057461023330688, "learning_rate": 2e-05, "loss": 0.6473, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7501, "tokens_per_second_per_gpu": 10571.15, "total_tokens": 740677412 }, { "epoch": 0.4689922480620155, "grad_norm": 0.8716968297958374, "learning_rate": 2e-05, "loss": 0.6122, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7502, "tokens_per_second_per_gpu": 11284.3, "total_tokens": 740777500 }, { "epoch": 0.46905476369092275, "grad_norm": 0.8695382475852966, "learning_rate": 2e-05, "loss": 0.6685, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7503, "tokens_per_second_per_gpu": 10575.61, "total_tokens": 740882286 }, { "epoch": 0.46911727931982994, "grad_norm": 0.9125921726226807, "learning_rate": 2e-05, "loss": 0.6435, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7504, "tokens_per_second_per_gpu": 11669.47, "total_tokens": 740986244 }, { "epoch": 0.4691797949487372, "grad_norm": 0.8691465258598328, "learning_rate": 2e-05, "loss": 0.6501, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7505, "tokens_per_second_per_gpu": 10552.24, "total_tokens": 741088555 }, { "epoch": 0.46924231057764443, "grad_norm": 0.9015362858772278, "learning_rate": 2e-05, "loss": 0.6352, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7506, "tokens_per_second_per_gpu": 9834.23, "total_tokens": 741181388 }, { "epoch": 0.4693048262065516, "grad_norm": 0.8922246694564819, "learning_rate": 2e-05, "loss": 0.6235, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7507, "tokens_per_second_per_gpu": 10047.11, "total_tokens": 741280145 }, { "epoch": 0.46936734183545886, "grad_norm": 0.8923696279525757, "learning_rate": 2e-05, "loss": 0.6264, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7508, "tokens_per_second_per_gpu": 10467.63, "total_tokens": 741380350 }, { "epoch": 0.4694298574643661, "grad_norm": 0.9120371341705322, "learning_rate": 2e-05, "loss": 0.6366, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7509, "tokens_per_second_per_gpu": 10317.82, "total_tokens": 741477184 }, { "epoch": 0.4694923730932733, "grad_norm": 0.8922126293182373, "learning_rate": 2e-05, "loss": 0.6645, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7510, "tokens_per_second_per_gpu": 11293.64, "total_tokens": 741578760 }, { "epoch": 0.46955488872218054, "grad_norm": 0.8750907182693481, "learning_rate": 2e-05, "loss": 0.6756, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7511, "tokens_per_second_per_gpu": 11030.93, "total_tokens": 741680766 }, { "epoch": 0.4696174043510878, "grad_norm": 0.8565540313720703, "learning_rate": 2e-05, "loss": 0.6115, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7512, "tokens_per_second_per_gpu": 10554.05, "total_tokens": 741780517 }, { "epoch": 0.469679919979995, "grad_norm": 0.8793808817863464, "learning_rate": 2e-05, "loss": 0.7003, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7513, "tokens_per_second_per_gpu": 11241.03, "total_tokens": 741883717 }, { "epoch": 0.4697424356089022, "grad_norm": 0.9742761254310608, "learning_rate": 2e-05, "loss": 0.6675, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7514, "tokens_per_second_per_gpu": 9849.39, "total_tokens": 741978368 }, { "epoch": 0.46980495123780946, "grad_norm": 0.930931568145752, "learning_rate": 2e-05, "loss": 0.6361, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7515, "tokens_per_second_per_gpu": 10128.84, "total_tokens": 742076555 }, { "epoch": 0.46986746686671665, "grad_norm": 0.8702473640441895, "learning_rate": 2e-05, "loss": 0.612, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7516, "tokens_per_second_per_gpu": 10154.9, "total_tokens": 742173161 }, { "epoch": 0.4699299824956239, "grad_norm": 0.8609806895256042, "learning_rate": 2e-05, "loss": 0.6483, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7517, "tokens_per_second_per_gpu": 10746.84, "total_tokens": 742275419 }, { "epoch": 0.46999249812453114, "grad_norm": 0.9307698607444763, "learning_rate": 2e-05, "loss": 0.6945, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7518, "tokens_per_second_per_gpu": 10690.18, "total_tokens": 742370939 }, { "epoch": 0.47005501375343833, "grad_norm": 0.9256975054740906, "learning_rate": 2e-05, "loss": 0.6463, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7519, "tokens_per_second_per_gpu": 10443.13, "total_tokens": 742469078 }, { "epoch": 0.4701175293823456, "grad_norm": 0.9128365516662598, "learning_rate": 2e-05, "loss": 0.6419, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7520, "tokens_per_second_per_gpu": 10274.22, "total_tokens": 742564374 }, { "epoch": 0.4701800450112528, "grad_norm": 0.9051207900047302, "learning_rate": 2e-05, "loss": 0.6441, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7521, "tokens_per_second_per_gpu": 10117.16, "total_tokens": 742666169 }, { "epoch": 0.47024256064016007, "grad_norm": 0.8730947375297546, "learning_rate": 2e-05, "loss": 0.6002, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7522, "tokens_per_second_per_gpu": 10211.06, "total_tokens": 742765709 }, { "epoch": 0.47030507626906726, "grad_norm": 0.8587074875831604, "learning_rate": 2e-05, "loss": 0.6443, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7523, "tokens_per_second_per_gpu": 9889.78, "total_tokens": 742866890 }, { "epoch": 0.4703675918979745, "grad_norm": 0.8874419927597046, "learning_rate": 2e-05, "loss": 0.6257, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7524, "tokens_per_second_per_gpu": 9444.92, "total_tokens": 742960506 }, { "epoch": 0.47043010752688175, "grad_norm": 0.9041507244110107, "learning_rate": 2e-05, "loss": 0.7, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7525, "tokens_per_second_per_gpu": 10615.5, "total_tokens": 743063630 }, { "epoch": 0.47049262315578894, "grad_norm": 0.9019749164581299, "learning_rate": 2e-05, "loss": 0.5916, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7526, "tokens_per_second_per_gpu": 10668.91, "total_tokens": 743159071 }, { "epoch": 0.4705551387846962, "grad_norm": 0.837659478187561, "learning_rate": 2e-05, "loss": 0.598, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7527, "tokens_per_second_per_gpu": 10821.75, "total_tokens": 743261778 }, { "epoch": 0.4706176544136034, "grad_norm": 0.8573055863380432, "learning_rate": 2e-05, "loss": 0.6633, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7528, "tokens_per_second_per_gpu": 10411.7, "total_tokens": 743361694 }, { "epoch": 0.4706801700425106, "grad_norm": 0.8669008612632751, "learning_rate": 2e-05, "loss": 0.579, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7529, "tokens_per_second_per_gpu": 9762.33, "total_tokens": 743458112 }, { "epoch": 0.47074268567141786, "grad_norm": 0.8515707850456238, "learning_rate": 2e-05, "loss": 0.6208, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7530, "tokens_per_second_per_gpu": 9342.41, "total_tokens": 743554292 }, { "epoch": 0.4708052013003251, "grad_norm": 0.9029080271720886, "learning_rate": 2e-05, "loss": 0.6692, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7531, "tokens_per_second_per_gpu": 10539.0, "total_tokens": 743654040 }, { "epoch": 0.4708677169292323, "grad_norm": 0.9126076698303223, "learning_rate": 2e-05, "loss": 0.6272, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7532, "tokens_per_second_per_gpu": 10601.77, "total_tokens": 743753523 }, { "epoch": 0.47093023255813954, "grad_norm": 0.8965096473693848, "learning_rate": 2e-05, "loss": 0.6422, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7533, "tokens_per_second_per_gpu": 10446.58, "total_tokens": 743850553 }, { "epoch": 0.4709927481870468, "grad_norm": 0.852328360080719, "learning_rate": 2e-05, "loss": 0.6378, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7534, "tokens_per_second_per_gpu": 10705.31, "total_tokens": 743950169 }, { "epoch": 0.471055263815954, "grad_norm": 0.8545737862586975, "learning_rate": 2e-05, "loss": 0.6352, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7535, "tokens_per_second_per_gpu": 10765.38, "total_tokens": 744049478 }, { "epoch": 0.4711177794448612, "grad_norm": 0.9154372215270996, "learning_rate": 2e-05, "loss": 0.6369, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7536, "tokens_per_second_per_gpu": 10770.19, "total_tokens": 744144732 }, { "epoch": 0.47118029507376846, "grad_norm": 0.9075828790664673, "learning_rate": 2e-05, "loss": 0.6426, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7537, "tokens_per_second_per_gpu": 10840.82, "total_tokens": 744248340 }, { "epoch": 0.47124281070267565, "grad_norm": 0.89801424741745, "learning_rate": 2e-05, "loss": 0.6845, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7538, "tokens_per_second_per_gpu": 10201.87, "total_tokens": 744344732 }, { "epoch": 0.4713053263315829, "grad_norm": 0.8735082149505615, "learning_rate": 2e-05, "loss": 0.6699, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7539, "tokens_per_second_per_gpu": 10816.45, "total_tokens": 744447382 }, { "epoch": 0.47136784196049014, "grad_norm": 0.8579781651496887, "learning_rate": 2e-05, "loss": 0.6474, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7540, "tokens_per_second_per_gpu": 10374.55, "total_tokens": 744550301 }, { "epoch": 0.47143035758939733, "grad_norm": 0.8724321126937866, "learning_rate": 2e-05, "loss": 0.6531, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7541, "tokens_per_second_per_gpu": 9812.67, "total_tokens": 744648254 }, { "epoch": 0.4714928732183046, "grad_norm": 0.9081611633300781, "learning_rate": 2e-05, "loss": 0.6129, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7542, "tokens_per_second_per_gpu": 10750.08, "total_tokens": 744741683 }, { "epoch": 0.4715553888472118, "grad_norm": 0.9042350053787231, "learning_rate": 2e-05, "loss": 0.6665, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7543, "tokens_per_second_per_gpu": 10823.75, "total_tokens": 744842126 }, { "epoch": 0.471617904476119, "grad_norm": 0.8957961201667786, "learning_rate": 2e-05, "loss": 0.6386, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7544, "tokens_per_second_per_gpu": 10961.54, "total_tokens": 744943870 }, { "epoch": 0.47168042010502625, "grad_norm": 0.8457605838775635, "learning_rate": 2e-05, "loss": 0.6044, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7545, "tokens_per_second_per_gpu": 10417.43, "total_tokens": 745042072 }, { "epoch": 0.4717429357339335, "grad_norm": 0.8944460153579712, "learning_rate": 2e-05, "loss": 0.612, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7546, "tokens_per_second_per_gpu": 10257.35, "total_tokens": 745139703 }, { "epoch": 0.4718054513628407, "grad_norm": 0.8688267469406128, "learning_rate": 2e-05, "loss": 0.6628, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7547, "tokens_per_second_per_gpu": 11046.95, "total_tokens": 745243336 }, { "epoch": 0.47186796699174793, "grad_norm": 0.8745776414871216, "learning_rate": 2e-05, "loss": 0.6307, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7548, "tokens_per_second_per_gpu": 10238.49, "total_tokens": 745343441 }, { "epoch": 0.4719304826206552, "grad_norm": 0.8773716688156128, "learning_rate": 2e-05, "loss": 0.593, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7549, "tokens_per_second_per_gpu": 10872.77, "total_tokens": 745444473 }, { "epoch": 0.47199299824956237, "grad_norm": 0.8854591846466064, "learning_rate": 2e-05, "loss": 0.6128, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7550, "tokens_per_second_per_gpu": 9937.86, "total_tokens": 745543406 }, { "epoch": 0.4720555138784696, "grad_norm": 0.8975232839584351, "learning_rate": 2e-05, "loss": 0.6455, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7551, "tokens_per_second_per_gpu": 10211.81, "total_tokens": 745640242 }, { "epoch": 0.47211802950737686, "grad_norm": 0.8943763375282288, "learning_rate": 2e-05, "loss": 0.6267, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7552, "tokens_per_second_per_gpu": 10321.12, "total_tokens": 745736043 }, { "epoch": 0.47218054513628405, "grad_norm": 0.9299323558807373, "learning_rate": 2e-05, "loss": 0.693, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7553, "tokens_per_second_per_gpu": 10530.05, "total_tokens": 745834814 }, { "epoch": 0.4722430607651913, "grad_norm": 0.8908074498176575, "learning_rate": 2e-05, "loss": 0.645, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7554, "tokens_per_second_per_gpu": 11226.69, "total_tokens": 745935776 }, { "epoch": 0.47230557639409854, "grad_norm": 0.882809042930603, "learning_rate": 2e-05, "loss": 0.6104, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7555, "tokens_per_second_per_gpu": 9897.41, "total_tokens": 746031589 }, { "epoch": 0.4723680920230057, "grad_norm": 0.9736095666885376, "learning_rate": 2e-05, "loss": 0.6318, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7556, "tokens_per_second_per_gpu": 9960.97, "total_tokens": 746130354 }, { "epoch": 0.47243060765191297, "grad_norm": 0.846753716468811, "learning_rate": 2e-05, "loss": 0.6075, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7557, "tokens_per_second_per_gpu": 10444.87, "total_tokens": 746229859 }, { "epoch": 0.4724931232808202, "grad_norm": 0.8755313158035278, "learning_rate": 2e-05, "loss": 0.6464, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7558, "tokens_per_second_per_gpu": 10863.2, "total_tokens": 746332140 }, { "epoch": 0.4725556389097274, "grad_norm": 0.9093614220619202, "learning_rate": 2e-05, "loss": 0.663, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7559, "tokens_per_second_per_gpu": 10087.95, "total_tokens": 746427704 }, { "epoch": 0.47261815453863465, "grad_norm": 0.920120894908905, "learning_rate": 2e-05, "loss": 0.7002, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7560, "tokens_per_second_per_gpu": 10320.9, "total_tokens": 746527379 }, { "epoch": 0.4726806701675419, "grad_norm": 0.9248592853546143, "learning_rate": 2e-05, "loss": 0.6496, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7561, "tokens_per_second_per_gpu": 10871.29, "total_tokens": 746628845 }, { "epoch": 0.47274318579644914, "grad_norm": 0.8740406036376953, "learning_rate": 2e-05, "loss": 0.6552, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7562, "tokens_per_second_per_gpu": 11169.75, "total_tokens": 746735062 }, { "epoch": 0.47280570142535633, "grad_norm": 0.9036451578140259, "learning_rate": 2e-05, "loss": 0.652, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7563, "tokens_per_second_per_gpu": 10036.83, "total_tokens": 746830498 }, { "epoch": 0.4728682170542636, "grad_norm": 0.8921274542808533, "learning_rate": 2e-05, "loss": 0.6199, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7564, "tokens_per_second_per_gpu": 9834.66, "total_tokens": 746925969 }, { "epoch": 0.4729307326831708, "grad_norm": 0.9013462066650391, "learning_rate": 2e-05, "loss": 0.6337, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7565, "tokens_per_second_per_gpu": 10476.31, "total_tokens": 747023389 }, { "epoch": 0.472993248312078, "grad_norm": 0.8984540700912476, "learning_rate": 2e-05, "loss": 0.6409, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7566, "tokens_per_second_per_gpu": 10541.75, "total_tokens": 747124373 }, { "epoch": 0.47305576394098525, "grad_norm": 0.9296323657035828, "learning_rate": 2e-05, "loss": 0.6553, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7567, "tokens_per_second_per_gpu": 10588.62, "total_tokens": 747220184 }, { "epoch": 0.4731182795698925, "grad_norm": 0.8897682428359985, "learning_rate": 2e-05, "loss": 0.6218, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7568, "tokens_per_second_per_gpu": 10243.71, "total_tokens": 747319136 }, { "epoch": 0.4731807951987997, "grad_norm": 0.863457202911377, "learning_rate": 2e-05, "loss": 0.6505, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7569, "tokens_per_second_per_gpu": 11118.23, "total_tokens": 747422870 }, { "epoch": 0.47324331082770693, "grad_norm": 0.9002708196640015, "learning_rate": 2e-05, "loss": 0.625, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7570, "tokens_per_second_per_gpu": 9740.19, "total_tokens": 747520676 }, { "epoch": 0.4733058264566142, "grad_norm": 0.9024486541748047, "learning_rate": 2e-05, "loss": 0.6382, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7571, "tokens_per_second_per_gpu": 11372.65, "total_tokens": 747625089 }, { "epoch": 0.47336834208552137, "grad_norm": 0.9349262118339539, "learning_rate": 2e-05, "loss": 0.6381, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7572, "tokens_per_second_per_gpu": 10588.7, "total_tokens": 747722194 }, { "epoch": 0.4734308577144286, "grad_norm": 0.8867911100387573, "learning_rate": 2e-05, "loss": 0.6589, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7573, "tokens_per_second_per_gpu": 11069.12, "total_tokens": 747823998 }, { "epoch": 0.47349337334333585, "grad_norm": 0.8974500894546509, "learning_rate": 2e-05, "loss": 0.6027, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7574, "tokens_per_second_per_gpu": 10198.27, "total_tokens": 747921175 }, { "epoch": 0.47355588897224304, "grad_norm": 0.9417710304260254, "learning_rate": 2e-05, "loss": 0.6239, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7575, "tokens_per_second_per_gpu": 10814.71, "total_tokens": 748021638 }, { "epoch": 0.4736184046011503, "grad_norm": 0.8946247100830078, "learning_rate": 2e-05, "loss": 0.6438, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7576, "tokens_per_second_per_gpu": 10781.66, "total_tokens": 748121840 }, { "epoch": 0.47368092023005753, "grad_norm": 0.8835170865058899, "learning_rate": 2e-05, "loss": 0.6485, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7577, "tokens_per_second_per_gpu": 10926.96, "total_tokens": 748223520 }, { "epoch": 0.4737434358589647, "grad_norm": 0.9319185018539429, "learning_rate": 2e-05, "loss": 0.629, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7578, "tokens_per_second_per_gpu": 10519.19, "total_tokens": 748318644 }, { "epoch": 0.47380595148787197, "grad_norm": 0.924608051776886, "learning_rate": 2e-05, "loss": 0.6638, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7579, "tokens_per_second_per_gpu": 10448.84, "total_tokens": 748414699 }, { "epoch": 0.4738684671167792, "grad_norm": 0.8597116470336914, "learning_rate": 2e-05, "loss": 0.6369, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7580, "tokens_per_second_per_gpu": 10564.38, "total_tokens": 748514783 }, { "epoch": 0.4739309827456864, "grad_norm": 0.8831878900527954, "learning_rate": 2e-05, "loss": 0.6555, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7581, "tokens_per_second_per_gpu": 10324.95, "total_tokens": 748613166 }, { "epoch": 0.47399349837459365, "grad_norm": 0.8947650194168091, "learning_rate": 2e-05, "loss": 0.681, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7582, "tokens_per_second_per_gpu": 10468.27, "total_tokens": 748713027 }, { "epoch": 0.4740560140035009, "grad_norm": 0.8940874934196472, "learning_rate": 2e-05, "loss": 0.6487, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7583, "tokens_per_second_per_gpu": 10914.41, "total_tokens": 748813797 }, { "epoch": 0.4741185296324081, "grad_norm": 0.861479640007019, "learning_rate": 2e-05, "loss": 0.622, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7584, "tokens_per_second_per_gpu": 9618.54, "total_tokens": 748914444 }, { "epoch": 0.4741810452613153, "grad_norm": 0.876535177230835, "learning_rate": 2e-05, "loss": 0.6346, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7585, "tokens_per_second_per_gpu": 9939.27, "total_tokens": 749015004 }, { "epoch": 0.47424356089022257, "grad_norm": 0.872510552406311, "learning_rate": 2e-05, "loss": 0.6382, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7586, "tokens_per_second_per_gpu": 10525.04, "total_tokens": 749114050 }, { "epoch": 0.47430607651912976, "grad_norm": 0.8343330025672913, "learning_rate": 2e-05, "loss": 0.5991, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7587, "tokens_per_second_per_gpu": 11039.41, "total_tokens": 749213671 }, { "epoch": 0.474368592148037, "grad_norm": 0.8804579973220825, "learning_rate": 2e-05, "loss": 0.6778, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7588, "tokens_per_second_per_gpu": 11588.1, "total_tokens": 749315934 }, { "epoch": 0.47443110777694425, "grad_norm": 0.876654326915741, "learning_rate": 2e-05, "loss": 0.6776, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7589, "tokens_per_second_per_gpu": 11343.35, "total_tokens": 749417274 }, { "epoch": 0.47449362340585144, "grad_norm": 0.9011462926864624, "learning_rate": 2e-05, "loss": 0.6298, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7590, "tokens_per_second_per_gpu": 10554.95, "total_tokens": 749515258 }, { "epoch": 0.4745561390347587, "grad_norm": 0.8461520075798035, "learning_rate": 2e-05, "loss": 0.6053, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7591, "tokens_per_second_per_gpu": 10535.34, "total_tokens": 749615402 }, { "epoch": 0.47461865466366593, "grad_norm": 0.8751388788223267, "learning_rate": 2e-05, "loss": 0.6538, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7592, "tokens_per_second_per_gpu": 10390.58, "total_tokens": 749713779 }, { "epoch": 0.4746811702925731, "grad_norm": 0.9448792934417725, "learning_rate": 2e-05, "loss": 0.6524, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7593, "tokens_per_second_per_gpu": 9993.96, "total_tokens": 749813058 }, { "epoch": 0.47474368592148036, "grad_norm": 0.9024336338043213, "learning_rate": 2e-05, "loss": 0.6227, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7594, "tokens_per_second_per_gpu": 10200.33, "total_tokens": 749910309 }, { "epoch": 0.4748062015503876, "grad_norm": 0.898304283618927, "learning_rate": 2e-05, "loss": 0.6335, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7595, "tokens_per_second_per_gpu": 10542.44, "total_tokens": 750009040 }, { "epoch": 0.4748687171792948, "grad_norm": 0.9128329753875732, "learning_rate": 2e-05, "loss": 0.6327, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7596, "tokens_per_second_per_gpu": 10897.34, "total_tokens": 750109970 }, { "epoch": 0.47493123280820204, "grad_norm": 0.9111632704734802, "learning_rate": 2e-05, "loss": 0.6411, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7597, "tokens_per_second_per_gpu": 11075.61, "total_tokens": 750207676 }, { "epoch": 0.4749937484371093, "grad_norm": 1.05693781375885, "learning_rate": 2e-05, "loss": 0.6011, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7598, "tokens_per_second_per_gpu": 10211.36, "total_tokens": 750306141 }, { "epoch": 0.47505626406601653, "grad_norm": 0.8587053418159485, "learning_rate": 2e-05, "loss": 0.6372, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7599, "tokens_per_second_per_gpu": 10699.28, "total_tokens": 750406691 }, { "epoch": 0.4751187796949237, "grad_norm": 0.8984963893890381, "learning_rate": 2e-05, "loss": 0.6616, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7600, "tokens_per_second_per_gpu": 10694.38, "total_tokens": 750506975 }, { "epoch": 0.47518129532383097, "grad_norm": 0.8972057104110718, "learning_rate": 2e-05, "loss": 0.6356, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7601, "tokens_per_second_per_gpu": 11104.8, "total_tokens": 750605951 }, { "epoch": 0.4752438109527382, "grad_norm": 0.9231801629066467, "learning_rate": 2e-05, "loss": 0.6564, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7602, "tokens_per_second_per_gpu": 14324.43, "total_tokens": 750701250 }, { "epoch": 0.4753063265816454, "grad_norm": 0.8965283632278442, "learning_rate": 2e-05, "loss": 0.6453, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7603, "tokens_per_second_per_gpu": 10272.68, "total_tokens": 750797089 }, { "epoch": 0.47536884221055264, "grad_norm": 0.8886910676956177, "learning_rate": 2e-05, "loss": 0.6285, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7604, "tokens_per_second_per_gpu": 10662.07, "total_tokens": 750897008 }, { "epoch": 0.4754313578394599, "grad_norm": 0.9601432681083679, "learning_rate": 2e-05, "loss": 0.6114, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7605, "tokens_per_second_per_gpu": 8875.76, "total_tokens": 750986601 }, { "epoch": 0.4754938734683671, "grad_norm": 0.920272171497345, "learning_rate": 2e-05, "loss": 0.6528, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7606, "tokens_per_second_per_gpu": 10674.25, "total_tokens": 751082615 }, { "epoch": 0.4755563890972743, "grad_norm": 0.8591728210449219, "learning_rate": 2e-05, "loss": 0.6174, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7607, "tokens_per_second_per_gpu": 10597.73, "total_tokens": 751180645 }, { "epoch": 0.47561890472618157, "grad_norm": 0.8991831541061401, "learning_rate": 2e-05, "loss": 0.6586, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7608, "tokens_per_second_per_gpu": 11139.15, "total_tokens": 751280329 }, { "epoch": 0.47568142035508876, "grad_norm": 0.8823270201683044, "learning_rate": 2e-05, "loss": 0.6324, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7609, "tokens_per_second_per_gpu": 11383.09, "total_tokens": 751382215 }, { "epoch": 0.475743935983996, "grad_norm": 0.9435795545578003, "learning_rate": 2e-05, "loss": 0.6264, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7610, "tokens_per_second_per_gpu": 9487.87, "total_tokens": 751474793 }, { "epoch": 0.47580645161290325, "grad_norm": 0.9585411548614502, "learning_rate": 2e-05, "loss": 0.6873, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7611, "tokens_per_second_per_gpu": 10562.08, "total_tokens": 751573616 }, { "epoch": 0.47586896724181044, "grad_norm": 0.8820030689239502, "learning_rate": 2e-05, "loss": 0.6439, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7612, "tokens_per_second_per_gpu": 10498.14, "total_tokens": 751674183 }, { "epoch": 0.4759314828707177, "grad_norm": 0.8529105186462402, "learning_rate": 2e-05, "loss": 0.6032, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7613, "tokens_per_second_per_gpu": 10081.53, "total_tokens": 751772681 }, { "epoch": 0.4759939984996249, "grad_norm": 0.8923335671424866, "learning_rate": 2e-05, "loss": 0.6551, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7614, "tokens_per_second_per_gpu": 11008.03, "total_tokens": 751873873 }, { "epoch": 0.4760565141285321, "grad_norm": 0.892758846282959, "learning_rate": 2e-05, "loss": 0.6518, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7615, "tokens_per_second_per_gpu": 10566.86, "total_tokens": 751970813 }, { "epoch": 0.47611902975743936, "grad_norm": 0.9004063010215759, "learning_rate": 2e-05, "loss": 0.6275, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7616, "tokens_per_second_per_gpu": 10579.75, "total_tokens": 752067765 }, { "epoch": 0.4761815453863466, "grad_norm": 0.8844404816627502, "learning_rate": 2e-05, "loss": 0.6663, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7617, "tokens_per_second_per_gpu": 11068.12, "total_tokens": 752166660 }, { "epoch": 0.4762440610152538, "grad_norm": 0.8745349645614624, "learning_rate": 2e-05, "loss": 0.6198, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7618, "tokens_per_second_per_gpu": 10107.04, "total_tokens": 752261650 }, { "epoch": 0.47630657664416104, "grad_norm": 0.9076001048088074, "learning_rate": 2e-05, "loss": 0.6699, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7619, "tokens_per_second_per_gpu": 10432.18, "total_tokens": 752360312 }, { "epoch": 0.4763690922730683, "grad_norm": 0.8572481870651245, "learning_rate": 2e-05, "loss": 0.6506, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7620, "tokens_per_second_per_gpu": 10358.83, "total_tokens": 752461129 }, { "epoch": 0.4764316079019755, "grad_norm": 0.8620836734771729, "learning_rate": 2e-05, "loss": 0.6125, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7621, "tokens_per_second_per_gpu": 10520.41, "total_tokens": 752558611 }, { "epoch": 0.4764941235308827, "grad_norm": 0.9026281833648682, "learning_rate": 2e-05, "loss": 0.6979, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7622, "tokens_per_second_per_gpu": 11046.32, "total_tokens": 752659489 }, { "epoch": 0.47655663915978996, "grad_norm": 0.9576801657676697, "learning_rate": 2e-05, "loss": 0.6693, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7623, "tokens_per_second_per_gpu": 10177.12, "total_tokens": 752757262 }, { "epoch": 0.47661915478869715, "grad_norm": 0.8938007950782776, "learning_rate": 2e-05, "loss": 0.6534, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7624, "tokens_per_second_per_gpu": 10755.64, "total_tokens": 752855961 }, { "epoch": 0.4766816704176044, "grad_norm": 0.9028609991073608, "learning_rate": 2e-05, "loss": 0.6447, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7625, "tokens_per_second_per_gpu": 10148.67, "total_tokens": 752954161 }, { "epoch": 0.47674418604651164, "grad_norm": 0.8806573152542114, "learning_rate": 2e-05, "loss": 0.5878, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7626, "tokens_per_second_per_gpu": 9496.65, "total_tokens": 753045904 }, { "epoch": 0.47680670167541883, "grad_norm": 0.8658377528190613, "learning_rate": 2e-05, "loss": 0.644, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7627, "tokens_per_second_per_gpu": 10853.07, "total_tokens": 753145209 }, { "epoch": 0.4768692173043261, "grad_norm": 0.9106963276863098, "learning_rate": 2e-05, "loss": 0.6406, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7628, "tokens_per_second_per_gpu": 10312.89, "total_tokens": 753242164 }, { "epoch": 0.4769317329332333, "grad_norm": 0.8741430044174194, "learning_rate": 2e-05, "loss": 0.6015, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7629, "tokens_per_second_per_gpu": 9993.21, "total_tokens": 753337771 }, { "epoch": 0.4769942485621405, "grad_norm": 0.889802873134613, "learning_rate": 2e-05, "loss": 0.6788, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7630, "tokens_per_second_per_gpu": 10894.97, "total_tokens": 753436837 }, { "epoch": 0.47705676419104776, "grad_norm": 0.9439026713371277, "learning_rate": 2e-05, "loss": 0.6859, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7631, "tokens_per_second_per_gpu": 11261.38, "total_tokens": 753535513 }, { "epoch": 0.477119279819955, "grad_norm": 0.8515987396240234, "learning_rate": 2e-05, "loss": 0.6312, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7632, "tokens_per_second_per_gpu": 10502.48, "total_tokens": 753636618 }, { "epoch": 0.4771817954488622, "grad_norm": 0.8580708503723145, "learning_rate": 2e-05, "loss": 0.6339, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7633, "tokens_per_second_per_gpu": 10739.44, "total_tokens": 753740183 }, { "epoch": 0.47724431107776943, "grad_norm": 0.8699296116828918, "learning_rate": 2e-05, "loss": 0.6433, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7634, "tokens_per_second_per_gpu": 10074.54, "total_tokens": 753837179 }, { "epoch": 0.4773068267066767, "grad_norm": 0.9051982164382935, "learning_rate": 2e-05, "loss": 0.6489, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7635, "tokens_per_second_per_gpu": 10052.43, "total_tokens": 753935223 }, { "epoch": 0.47736934233558387, "grad_norm": 0.881412148475647, "learning_rate": 2e-05, "loss": 0.6479, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7636, "tokens_per_second_per_gpu": 10457.53, "total_tokens": 754032694 }, { "epoch": 0.4774318579644911, "grad_norm": 0.8996641039848328, "learning_rate": 2e-05, "loss": 0.6383, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7637, "tokens_per_second_per_gpu": 10730.88, "total_tokens": 754130600 }, { "epoch": 0.47749437359339836, "grad_norm": 0.8654923439025879, "learning_rate": 2e-05, "loss": 0.6301, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7638, "tokens_per_second_per_gpu": 10590.66, "total_tokens": 754230243 }, { "epoch": 0.4775568892223056, "grad_norm": 0.8769327402114868, "learning_rate": 2e-05, "loss": 0.616, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7639, "tokens_per_second_per_gpu": 9871.46, "total_tokens": 754326177 }, { "epoch": 0.4776194048512128, "grad_norm": 0.9182460308074951, "learning_rate": 2e-05, "loss": 0.612, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7640, "tokens_per_second_per_gpu": 10090.28, "total_tokens": 754420535 }, { "epoch": 0.47768192048012004, "grad_norm": 0.8922576904296875, "learning_rate": 2e-05, "loss": 0.6439, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7641, "tokens_per_second_per_gpu": 10700.62, "total_tokens": 754520678 }, { "epoch": 0.4777444361090273, "grad_norm": 0.922693133354187, "learning_rate": 2e-05, "loss": 0.6409, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7642, "tokens_per_second_per_gpu": 10218.48, "total_tokens": 754616681 }, { "epoch": 0.47780695173793447, "grad_norm": 0.8914116024971008, "learning_rate": 2e-05, "loss": 0.6155, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7643, "tokens_per_second_per_gpu": 10571.06, "total_tokens": 754713551 }, { "epoch": 0.4778694673668417, "grad_norm": 0.8997825980186462, "learning_rate": 2e-05, "loss": 0.6556, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7644, "tokens_per_second_per_gpu": 10744.37, "total_tokens": 754815581 }, { "epoch": 0.47793198299574896, "grad_norm": 0.8425285816192627, "learning_rate": 2e-05, "loss": 0.6444, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7645, "tokens_per_second_per_gpu": 10619.57, "total_tokens": 754915585 }, { "epoch": 0.47799449862465615, "grad_norm": 0.8506074547767639, "learning_rate": 2e-05, "loss": 0.6182, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7646, "tokens_per_second_per_gpu": 10126.25, "total_tokens": 755014856 }, { "epoch": 0.4780570142535634, "grad_norm": 0.8689813613891602, "learning_rate": 2e-05, "loss": 0.6491, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7647, "tokens_per_second_per_gpu": 10306.49, "total_tokens": 755112542 }, { "epoch": 0.47811952988247064, "grad_norm": 0.9000239968299866, "learning_rate": 2e-05, "loss": 0.636, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7648, "tokens_per_second_per_gpu": 10435.93, "total_tokens": 755209323 }, { "epoch": 0.47818204551137783, "grad_norm": 0.8774977326393127, "learning_rate": 2e-05, "loss": 0.6517, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7649, "tokens_per_second_per_gpu": 10300.41, "total_tokens": 755308869 }, { "epoch": 0.4782445611402851, "grad_norm": 0.8969681262969971, "learning_rate": 2e-05, "loss": 0.6776, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7650, "tokens_per_second_per_gpu": 10433.03, "total_tokens": 755409462 }, { "epoch": 0.4783070767691923, "grad_norm": 0.8848411440849304, "learning_rate": 2e-05, "loss": 0.6405, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7651, "tokens_per_second_per_gpu": 10702.33, "total_tokens": 755509888 }, { "epoch": 0.4783695923980995, "grad_norm": 0.9039781093597412, "learning_rate": 2e-05, "loss": 0.645, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7652, "tokens_per_second_per_gpu": 10720.29, "total_tokens": 755606349 }, { "epoch": 0.47843210802700675, "grad_norm": 0.9575942158699036, "learning_rate": 2e-05, "loss": 0.6433, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7653, "tokens_per_second_per_gpu": 10527.79, "total_tokens": 755703636 }, { "epoch": 0.478494623655914, "grad_norm": 0.9114542007446289, "learning_rate": 2e-05, "loss": 0.6379, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7654, "tokens_per_second_per_gpu": 10355.58, "total_tokens": 755800462 }, { "epoch": 0.4785571392848212, "grad_norm": 0.908561110496521, "learning_rate": 2e-05, "loss": 0.6506, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7655, "tokens_per_second_per_gpu": 10559.11, "total_tokens": 755897413 }, { "epoch": 0.47861965491372843, "grad_norm": 0.8956286311149597, "learning_rate": 2e-05, "loss": 0.6262, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7656, "tokens_per_second_per_gpu": 9343.94, "total_tokens": 755991855 }, { "epoch": 0.4786821705426357, "grad_norm": 0.9014763832092285, "learning_rate": 2e-05, "loss": 0.6072, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7657, "tokens_per_second_per_gpu": 10652.02, "total_tokens": 756086494 }, { "epoch": 0.47874468617154287, "grad_norm": 0.8938537836074829, "learning_rate": 2e-05, "loss": 0.6136, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7658, "tokens_per_second_per_gpu": 10403.79, "total_tokens": 756185150 }, { "epoch": 0.4788072018004501, "grad_norm": 0.9221410751342773, "learning_rate": 2e-05, "loss": 0.6464, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7659, "tokens_per_second_per_gpu": 10163.22, "total_tokens": 756282476 }, { "epoch": 0.47886971742935736, "grad_norm": 0.8940111398696899, "learning_rate": 2e-05, "loss": 0.5858, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7660, "tokens_per_second_per_gpu": 10540.88, "total_tokens": 756381216 }, { "epoch": 0.47893223305826454, "grad_norm": 0.9637782573699951, "learning_rate": 2e-05, "loss": 0.6288, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7661, "tokens_per_second_per_gpu": 10071.5, "total_tokens": 756479518 }, { "epoch": 0.4789947486871718, "grad_norm": 0.8805311918258667, "learning_rate": 2e-05, "loss": 0.6478, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7662, "tokens_per_second_per_gpu": 11475.22, "total_tokens": 756581785 }, { "epoch": 0.47905726431607903, "grad_norm": 0.8830519318580627, "learning_rate": 2e-05, "loss": 0.683, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7663, "tokens_per_second_per_gpu": 10884.35, "total_tokens": 756686004 }, { "epoch": 0.4791197799449862, "grad_norm": 0.9235560297966003, "learning_rate": 2e-05, "loss": 0.6501, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7664, "tokens_per_second_per_gpu": 10644.67, "total_tokens": 756778198 }, { "epoch": 0.47918229557389347, "grad_norm": 0.9217670559883118, "learning_rate": 2e-05, "loss": 0.6253, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7665, "tokens_per_second_per_gpu": 10489.98, "total_tokens": 756873891 }, { "epoch": 0.4792448112028007, "grad_norm": 0.8793909549713135, "learning_rate": 2e-05, "loss": 0.623, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7666, "tokens_per_second_per_gpu": 10006.54, "total_tokens": 756970519 }, { "epoch": 0.4793073268317079, "grad_norm": 0.8766996264457703, "learning_rate": 2e-05, "loss": 0.64, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7667, "tokens_per_second_per_gpu": 10028.43, "total_tokens": 757071217 }, { "epoch": 0.47936984246061515, "grad_norm": 0.9466307759284973, "learning_rate": 2e-05, "loss": 0.6607, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7668, "tokens_per_second_per_gpu": 10575.84, "total_tokens": 757165414 }, { "epoch": 0.4794323580895224, "grad_norm": 0.9180787801742554, "learning_rate": 2e-05, "loss": 0.6697, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7669, "tokens_per_second_per_gpu": 10571.68, "total_tokens": 757264159 }, { "epoch": 0.4794948737184296, "grad_norm": 0.9572882652282715, "learning_rate": 2e-05, "loss": 0.6347, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7670, "tokens_per_second_per_gpu": 10603.16, "total_tokens": 757360937 }, { "epoch": 0.4795573893473368, "grad_norm": 0.8787346482276917, "learning_rate": 2e-05, "loss": 0.6637, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7671, "tokens_per_second_per_gpu": 10615.83, "total_tokens": 757461958 }, { "epoch": 0.47961990497624407, "grad_norm": 0.8802913427352905, "learning_rate": 2e-05, "loss": 0.642, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7672, "tokens_per_second_per_gpu": 10954.15, "total_tokens": 757564325 }, { "epoch": 0.47968242060515126, "grad_norm": 0.9180651903152466, "learning_rate": 2e-05, "loss": 0.6051, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7673, "tokens_per_second_per_gpu": 10425.77, "total_tokens": 757660191 }, { "epoch": 0.4797449362340585, "grad_norm": 0.9070793390274048, "learning_rate": 2e-05, "loss": 0.6529, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7674, "tokens_per_second_per_gpu": 10719.61, "total_tokens": 757760917 }, { "epoch": 0.47980745186296575, "grad_norm": 0.9115498065948486, "learning_rate": 2e-05, "loss": 0.6582, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7675, "tokens_per_second_per_gpu": 10496.92, "total_tokens": 757863376 }, { "epoch": 0.479869967491873, "grad_norm": 0.8785277605056763, "learning_rate": 2e-05, "loss": 0.6546, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7676, "tokens_per_second_per_gpu": 11049.0, "total_tokens": 757965085 }, { "epoch": 0.4799324831207802, "grad_norm": 0.8850374221801758, "learning_rate": 2e-05, "loss": 0.6349, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7677, "tokens_per_second_per_gpu": 10761.16, "total_tokens": 758067378 }, { "epoch": 0.47999499874968743, "grad_norm": 0.8921673893928528, "learning_rate": 2e-05, "loss": 0.6216, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7678, "tokens_per_second_per_gpu": 10970.31, "total_tokens": 758160582 }, { "epoch": 0.4800575143785947, "grad_norm": 0.9340184330940247, "learning_rate": 2e-05, "loss": 0.6433, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7679, "tokens_per_second_per_gpu": 10972.9, "total_tokens": 758255513 }, { "epoch": 0.48012003000750186, "grad_norm": 0.9504029154777527, "learning_rate": 2e-05, "loss": 0.6289, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7680, "tokens_per_second_per_gpu": 9996.82, "total_tokens": 758351961 }, { "epoch": 0.4801825456364091, "grad_norm": 0.8749567866325378, "learning_rate": 2e-05, "loss": 0.6174, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7681, "tokens_per_second_per_gpu": 10548.02, "total_tokens": 758451222 }, { "epoch": 0.48024506126531635, "grad_norm": 0.904607355594635, "learning_rate": 2e-05, "loss": 0.6389, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7682, "tokens_per_second_per_gpu": 10436.95, "total_tokens": 758552536 }, { "epoch": 0.48030757689422354, "grad_norm": 0.8805474638938904, "learning_rate": 2e-05, "loss": 0.6322, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7683, "tokens_per_second_per_gpu": 10500.96, "total_tokens": 758652064 }, { "epoch": 0.4803700925231308, "grad_norm": 0.8789290189743042, "learning_rate": 2e-05, "loss": 0.5882, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7684, "tokens_per_second_per_gpu": 10069.57, "total_tokens": 758747266 }, { "epoch": 0.48043260815203803, "grad_norm": 0.8964830040931702, "learning_rate": 2e-05, "loss": 0.6113, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7685, "tokens_per_second_per_gpu": 10766.93, "total_tokens": 758846845 }, { "epoch": 0.4804951237809452, "grad_norm": 0.9030591249465942, "learning_rate": 2e-05, "loss": 0.607, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7686, "tokens_per_second_per_gpu": 10193.23, "total_tokens": 758943956 }, { "epoch": 0.48055763940985247, "grad_norm": 0.8532781004905701, "learning_rate": 2e-05, "loss": 0.6143, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7687, "tokens_per_second_per_gpu": 11047.05, "total_tokens": 759042455 }, { "epoch": 0.4806201550387597, "grad_norm": 0.8791889548301697, "learning_rate": 2e-05, "loss": 0.6346, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7688, "tokens_per_second_per_gpu": 11145.64, "total_tokens": 759144000 }, { "epoch": 0.4806826706676669, "grad_norm": 0.9044599533081055, "learning_rate": 2e-05, "loss": 0.6749, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7689, "tokens_per_second_per_gpu": 10828.63, "total_tokens": 759243833 }, { "epoch": 0.48074518629657415, "grad_norm": 0.939721405506134, "learning_rate": 2e-05, "loss": 0.6361, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7690, "tokens_per_second_per_gpu": 10404.98, "total_tokens": 759342638 }, { "epoch": 0.4808077019254814, "grad_norm": 0.8945826888084412, "learning_rate": 2e-05, "loss": 0.6565, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7691, "tokens_per_second_per_gpu": 9729.38, "total_tokens": 759442919 }, { "epoch": 0.4808702175543886, "grad_norm": 0.8720382452011108, "learning_rate": 2e-05, "loss": 0.6367, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7692, "tokens_per_second_per_gpu": 10533.63, "total_tokens": 759544138 }, { "epoch": 0.4809327331832958, "grad_norm": 0.9078208804130554, "learning_rate": 2e-05, "loss": 0.6537, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7693, "tokens_per_second_per_gpu": 10989.22, "total_tokens": 759641596 }, { "epoch": 0.48099524881220307, "grad_norm": 0.9306880831718445, "learning_rate": 2e-05, "loss": 0.6875, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7694, "tokens_per_second_per_gpu": 10979.59, "total_tokens": 759741470 }, { "epoch": 0.48105776444111026, "grad_norm": 0.8771485090255737, "learning_rate": 2e-05, "loss": 0.663, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7695, "tokens_per_second_per_gpu": 10913.47, "total_tokens": 759845237 }, { "epoch": 0.4811202800700175, "grad_norm": 0.8753995895385742, "learning_rate": 2e-05, "loss": 0.658, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7696, "tokens_per_second_per_gpu": 10274.87, "total_tokens": 759943441 }, { "epoch": 0.48118279569892475, "grad_norm": 0.8691431879997253, "learning_rate": 2e-05, "loss": 0.6258, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7697, "tokens_per_second_per_gpu": 10588.64, "total_tokens": 760043471 }, { "epoch": 0.48124531132783194, "grad_norm": 0.9464024305343628, "learning_rate": 2e-05, "loss": 0.6471, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7698, "tokens_per_second_per_gpu": 10627.09, "total_tokens": 760139118 }, { "epoch": 0.4813078269567392, "grad_norm": 0.8755336999893188, "learning_rate": 2e-05, "loss": 0.6186, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7699, "tokens_per_second_per_gpu": 9905.88, "total_tokens": 760236854 }, { "epoch": 0.4813703425856464, "grad_norm": 0.8915502429008484, "learning_rate": 2e-05, "loss": 0.6741, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7700, "tokens_per_second_per_gpu": 10545.43, "total_tokens": 760336197 }, { "epoch": 0.4814328582145536, "grad_norm": 0.8902663588523865, "learning_rate": 2e-05, "loss": 0.6869, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7701, "tokens_per_second_per_gpu": 10847.1, "total_tokens": 760435097 }, { "epoch": 0.48149537384346086, "grad_norm": 0.9414408802986145, "learning_rate": 2e-05, "loss": 0.6311, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7702, "tokens_per_second_per_gpu": 11461.9, "total_tokens": 760534424 }, { "epoch": 0.4815578894723681, "grad_norm": 0.9061031937599182, "learning_rate": 2e-05, "loss": 0.6514, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7703, "tokens_per_second_per_gpu": 10541.95, "total_tokens": 760633990 }, { "epoch": 0.4816204051012753, "grad_norm": 0.9317144751548767, "learning_rate": 2e-05, "loss": 0.6616, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7704, "tokens_per_second_per_gpu": 11000.4, "total_tokens": 760735613 }, { "epoch": 0.48168292073018254, "grad_norm": 0.9282301664352417, "learning_rate": 2e-05, "loss": 0.6243, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7705, "tokens_per_second_per_gpu": 10862.84, "total_tokens": 760836443 }, { "epoch": 0.4817454363590898, "grad_norm": 0.8883500099182129, "learning_rate": 2e-05, "loss": 0.6416, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7706, "tokens_per_second_per_gpu": 10766.94, "total_tokens": 760937519 }, { "epoch": 0.481807951987997, "grad_norm": 0.9165394306182861, "learning_rate": 2e-05, "loss": 0.654, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7707, "tokens_per_second_per_gpu": 10172.14, "total_tokens": 761032350 }, { "epoch": 0.4818704676169042, "grad_norm": 0.9728252291679382, "learning_rate": 2e-05, "loss": 0.6713, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7708, "tokens_per_second_per_gpu": 11033.45, "total_tokens": 761133897 }, { "epoch": 0.48193298324581146, "grad_norm": 0.9172244071960449, "learning_rate": 2e-05, "loss": 0.6452, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7709, "tokens_per_second_per_gpu": 10440.0, "total_tokens": 761231060 }, { "epoch": 0.48199549887471865, "grad_norm": 0.8604857921600342, "learning_rate": 2e-05, "loss": 0.6179, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7710, "tokens_per_second_per_gpu": 10916.97, "total_tokens": 761330403 }, { "epoch": 0.4820580145036259, "grad_norm": 0.8844853043556213, "learning_rate": 2e-05, "loss": 0.6537, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7711, "tokens_per_second_per_gpu": 10562.02, "total_tokens": 761430875 }, { "epoch": 0.48212053013253314, "grad_norm": 0.8421915769577026, "learning_rate": 2e-05, "loss": 0.6166, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7712, "tokens_per_second_per_gpu": 10907.92, "total_tokens": 761533961 }, { "epoch": 0.4821830457614404, "grad_norm": 0.8719865083694458, "learning_rate": 2e-05, "loss": 0.6403, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7713, "tokens_per_second_per_gpu": 10759.42, "total_tokens": 761638395 }, { "epoch": 0.4822455613903476, "grad_norm": 0.8848496675491333, "learning_rate": 2e-05, "loss": 0.6159, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7714, "tokens_per_second_per_gpu": 10292.1, "total_tokens": 761731759 }, { "epoch": 0.4823080770192548, "grad_norm": 0.8686614036560059, "learning_rate": 2e-05, "loss": 0.6249, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7715, "tokens_per_second_per_gpu": 10185.54, "total_tokens": 761831343 }, { "epoch": 0.48237059264816207, "grad_norm": 0.877765417098999, "learning_rate": 2e-05, "loss": 0.6389, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7716, "tokens_per_second_per_gpu": 11223.35, "total_tokens": 761931517 }, { "epoch": 0.48243310827706926, "grad_norm": 0.8886361122131348, "learning_rate": 2e-05, "loss": 0.616, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7717, "tokens_per_second_per_gpu": 10285.46, "total_tokens": 762028573 }, { "epoch": 0.4824956239059765, "grad_norm": 0.8933610320091248, "learning_rate": 2e-05, "loss": 0.6062, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7718, "tokens_per_second_per_gpu": 9703.87, "total_tokens": 762120888 }, { "epoch": 0.48255813953488375, "grad_norm": 0.8873981237411499, "learning_rate": 2e-05, "loss": 0.6265, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7719, "tokens_per_second_per_gpu": 9918.57, "total_tokens": 762215236 }, { "epoch": 0.48262065516379093, "grad_norm": 0.9029974341392517, "learning_rate": 2e-05, "loss": 0.6299, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7720, "tokens_per_second_per_gpu": 10604.85, "total_tokens": 762311416 }, { "epoch": 0.4826831707926982, "grad_norm": 0.8746685981750488, "learning_rate": 2e-05, "loss": 0.6453, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7721, "tokens_per_second_per_gpu": 11162.72, "total_tokens": 762412014 }, { "epoch": 0.4827456864216054, "grad_norm": 0.8710156083106995, "learning_rate": 2e-05, "loss": 0.6242, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7722, "tokens_per_second_per_gpu": 10023.94, "total_tokens": 762507864 }, { "epoch": 0.4828082020505126, "grad_norm": 0.8732719421386719, "learning_rate": 2e-05, "loss": 0.6357, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7723, "tokens_per_second_per_gpu": 10591.67, "total_tokens": 762609178 }, { "epoch": 0.48287071767941986, "grad_norm": 0.899739146232605, "learning_rate": 2e-05, "loss": 0.604, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7724, "tokens_per_second_per_gpu": 9841.17, "total_tokens": 762705364 }, { "epoch": 0.4829332333083271, "grad_norm": 0.9331403970718384, "learning_rate": 2e-05, "loss": 0.6231, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7725, "tokens_per_second_per_gpu": 10007.7, "total_tokens": 762794938 }, { "epoch": 0.4829957489372343, "grad_norm": 0.9272019863128662, "learning_rate": 2e-05, "loss": 0.6957, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7726, "tokens_per_second_per_gpu": 10207.14, "total_tokens": 762892691 }, { "epoch": 0.48305826456614154, "grad_norm": 0.8781391382217407, "learning_rate": 2e-05, "loss": 0.6203, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7727, "tokens_per_second_per_gpu": 10816.42, "total_tokens": 762992209 }, { "epoch": 0.4831207801950488, "grad_norm": 0.8998309969902039, "learning_rate": 2e-05, "loss": 0.6576, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7728, "tokens_per_second_per_gpu": 9853.8, "total_tokens": 763090296 }, { "epoch": 0.48318329582395597, "grad_norm": 0.8784880042076111, "learning_rate": 2e-05, "loss": 0.6658, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7729, "tokens_per_second_per_gpu": 11280.85, "total_tokens": 763195235 }, { "epoch": 0.4832458114528632, "grad_norm": 0.89974045753479, "learning_rate": 2e-05, "loss": 0.6143, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7730, "tokens_per_second_per_gpu": 10039.94, "total_tokens": 763292139 }, { "epoch": 0.48330832708177046, "grad_norm": 0.9259485602378845, "learning_rate": 2e-05, "loss": 0.6938, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7731, "tokens_per_second_per_gpu": 9728.8, "total_tokens": 763388206 }, { "epoch": 0.48337084271067765, "grad_norm": 0.9129778742790222, "learning_rate": 2e-05, "loss": 0.6277, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7732, "tokens_per_second_per_gpu": 10508.6, "total_tokens": 763487396 }, { "epoch": 0.4834333583395849, "grad_norm": 0.9283603429794312, "learning_rate": 2e-05, "loss": 0.6665, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7733, "tokens_per_second_per_gpu": 10446.39, "total_tokens": 763586932 }, { "epoch": 0.48349587396849214, "grad_norm": 0.8983643054962158, "learning_rate": 2e-05, "loss": 0.6452, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7734, "tokens_per_second_per_gpu": 10559.99, "total_tokens": 763682723 }, { "epoch": 0.48355838959739933, "grad_norm": 0.9655348658561707, "learning_rate": 2e-05, "loss": 0.6582, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7735, "tokens_per_second_per_gpu": 10840.41, "total_tokens": 763781439 }, { "epoch": 0.4836209052263066, "grad_norm": 0.8899997472763062, "learning_rate": 2e-05, "loss": 0.671, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7736, "tokens_per_second_per_gpu": 10619.95, "total_tokens": 763882733 }, { "epoch": 0.4836834208552138, "grad_norm": 0.8879734873771667, "learning_rate": 2e-05, "loss": 0.6506, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7737, "tokens_per_second_per_gpu": 10579.17, "total_tokens": 763982546 }, { "epoch": 0.483745936484121, "grad_norm": 0.880759060382843, "learning_rate": 2e-05, "loss": 0.6289, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7738, "tokens_per_second_per_gpu": 10495.08, "total_tokens": 764079218 }, { "epoch": 0.48380845211302825, "grad_norm": 0.8524923324584961, "learning_rate": 2e-05, "loss": 0.6429, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7739, "tokens_per_second_per_gpu": 11305.73, "total_tokens": 764180713 }, { "epoch": 0.4838709677419355, "grad_norm": 0.8879362940788269, "learning_rate": 2e-05, "loss": 0.6635, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7740, "tokens_per_second_per_gpu": 10455.84, "total_tokens": 764282259 }, { "epoch": 0.4839334833708427, "grad_norm": 0.9199859499931335, "learning_rate": 2e-05, "loss": 0.6256, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7741, "tokens_per_second_per_gpu": 9748.28, "total_tokens": 764375287 }, { "epoch": 0.48399599899974993, "grad_norm": 0.9148202538490295, "learning_rate": 2e-05, "loss": 0.6661, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7742, "tokens_per_second_per_gpu": 10488.56, "total_tokens": 764475188 }, { "epoch": 0.4840585146286572, "grad_norm": 0.9286385774612427, "learning_rate": 2e-05, "loss": 0.6583, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7743, "tokens_per_second_per_gpu": 10716.94, "total_tokens": 764575395 }, { "epoch": 0.48412103025756437, "grad_norm": 0.8988038301467896, "learning_rate": 2e-05, "loss": 0.5897, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7744, "tokens_per_second_per_gpu": 10320.53, "total_tokens": 764670869 }, { "epoch": 0.4841835458864716, "grad_norm": 0.8853898048400879, "learning_rate": 2e-05, "loss": 0.6487, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7745, "tokens_per_second_per_gpu": 11118.28, "total_tokens": 764775249 }, { "epoch": 0.48424606151537886, "grad_norm": 0.8760208487510681, "learning_rate": 2e-05, "loss": 0.6623, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7746, "tokens_per_second_per_gpu": 11295.1, "total_tokens": 764879196 }, { "epoch": 0.48430857714428605, "grad_norm": 0.9195184707641602, "learning_rate": 2e-05, "loss": 0.6528, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7747, "tokens_per_second_per_gpu": 10721.57, "total_tokens": 764978567 }, { "epoch": 0.4843710927731933, "grad_norm": 0.8645285964012146, "learning_rate": 2e-05, "loss": 0.639, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7748, "tokens_per_second_per_gpu": 11190.67, "total_tokens": 765079863 }, { "epoch": 0.48443360840210054, "grad_norm": 0.8673414587974548, "learning_rate": 2e-05, "loss": 0.5971, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7749, "tokens_per_second_per_gpu": 10521.6, "total_tokens": 765177556 }, { "epoch": 0.4844961240310077, "grad_norm": 0.8717751502990723, "learning_rate": 2e-05, "loss": 0.6654, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7750, "tokens_per_second_per_gpu": 10173.65, "total_tokens": 765276958 }, { "epoch": 0.48455863965991497, "grad_norm": 0.8926973342895508, "learning_rate": 2e-05, "loss": 0.6795, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7751, "tokens_per_second_per_gpu": 9846.2, "total_tokens": 765374806 }, { "epoch": 0.4846211552888222, "grad_norm": 0.9176138639450073, "learning_rate": 2e-05, "loss": 0.634, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7752, "tokens_per_second_per_gpu": 10522.89, "total_tokens": 765473781 }, { "epoch": 0.48468367091772946, "grad_norm": 0.9391577839851379, "learning_rate": 2e-05, "loss": 0.6798, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7753, "tokens_per_second_per_gpu": 10549.33, "total_tokens": 765570541 }, { "epoch": 0.48474618654663665, "grad_norm": 0.8885636329650879, "learning_rate": 2e-05, "loss": 0.6742, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7754, "tokens_per_second_per_gpu": 10438.83, "total_tokens": 765669810 }, { "epoch": 0.4848087021755439, "grad_norm": 0.8768271803855896, "learning_rate": 2e-05, "loss": 0.6365, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7755, "tokens_per_second_per_gpu": 11061.47, "total_tokens": 765770894 }, { "epoch": 0.48487121780445114, "grad_norm": 0.9072020053863525, "learning_rate": 2e-05, "loss": 0.6773, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7756, "tokens_per_second_per_gpu": 11373.7, "total_tokens": 765872677 }, { "epoch": 0.4849337334333583, "grad_norm": 1.0443018674850464, "learning_rate": 2e-05, "loss": 0.6384, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7757, "tokens_per_second_per_gpu": 9700.64, "total_tokens": 765968896 }, { "epoch": 0.48499624906226557, "grad_norm": 0.8700398802757263, "learning_rate": 2e-05, "loss": 0.6554, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7758, "tokens_per_second_per_gpu": 11614.22, "total_tokens": 766072730 }, { "epoch": 0.4850587646911728, "grad_norm": 0.9756271839141846, "learning_rate": 2e-05, "loss": 0.6774, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7759, "tokens_per_second_per_gpu": 10672.24, "total_tokens": 766170385 }, { "epoch": 0.48512128032008, "grad_norm": 0.8810422420501709, "learning_rate": 2e-05, "loss": 0.6372, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7760, "tokens_per_second_per_gpu": 10936.2, "total_tokens": 766272141 }, { "epoch": 0.48518379594898725, "grad_norm": 0.9056726694107056, "learning_rate": 2e-05, "loss": 0.6486, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7761, "tokens_per_second_per_gpu": 10770.15, "total_tokens": 766371443 }, { "epoch": 0.4852463115778945, "grad_norm": 0.9059341549873352, "learning_rate": 2e-05, "loss": 0.6637, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7762, "tokens_per_second_per_gpu": 10962.65, "total_tokens": 766473607 }, { "epoch": 0.4853088272068017, "grad_norm": 0.8941859602928162, "learning_rate": 2e-05, "loss": 0.6023, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7763, "tokens_per_second_per_gpu": 10366.19, "total_tokens": 766567539 }, { "epoch": 0.48537134283570893, "grad_norm": 0.8650130033493042, "learning_rate": 2e-05, "loss": 0.6596, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7764, "tokens_per_second_per_gpu": 10417.7, "total_tokens": 766668894 }, { "epoch": 0.4854338584646162, "grad_norm": 0.8680990934371948, "learning_rate": 2e-05, "loss": 0.6097, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7765, "tokens_per_second_per_gpu": 9712.86, "total_tokens": 766764363 }, { "epoch": 0.48549637409352336, "grad_norm": 0.8840035796165466, "learning_rate": 2e-05, "loss": 0.6766, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7766, "tokens_per_second_per_gpu": 10724.17, "total_tokens": 766865433 }, { "epoch": 0.4855588897224306, "grad_norm": 0.9096989631652832, "learning_rate": 2e-05, "loss": 0.6403, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7767, "tokens_per_second_per_gpu": 10830.8, "total_tokens": 766965307 }, { "epoch": 0.48562140535133785, "grad_norm": 0.8744066953659058, "learning_rate": 2e-05, "loss": 0.6164, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7768, "tokens_per_second_per_gpu": 11087.32, "total_tokens": 767066065 }, { "epoch": 0.48568392098024504, "grad_norm": 0.895592212677002, "learning_rate": 2e-05, "loss": 0.6362, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7769, "tokens_per_second_per_gpu": 9837.58, "total_tokens": 767161739 }, { "epoch": 0.4857464366091523, "grad_norm": 0.8718828558921814, "learning_rate": 2e-05, "loss": 0.625, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7770, "tokens_per_second_per_gpu": 10495.55, "total_tokens": 767260096 }, { "epoch": 0.48580895223805953, "grad_norm": 0.8859180212020874, "learning_rate": 2e-05, "loss": 0.6401, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7771, "tokens_per_second_per_gpu": 10294.89, "total_tokens": 767359322 }, { "epoch": 0.4858714678669667, "grad_norm": 0.8795633912086487, "learning_rate": 2e-05, "loss": 0.6531, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7772, "tokens_per_second_per_gpu": 11186.43, "total_tokens": 767460218 }, { "epoch": 0.48593398349587397, "grad_norm": 0.9238924384117126, "learning_rate": 2e-05, "loss": 0.6425, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7773, "tokens_per_second_per_gpu": 11150.03, "total_tokens": 767559503 }, { "epoch": 0.4859964991247812, "grad_norm": 0.9119183421134949, "learning_rate": 2e-05, "loss": 0.6322, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7774, "tokens_per_second_per_gpu": 9834.45, "total_tokens": 767657114 }, { "epoch": 0.4860590147536884, "grad_norm": 0.8668271899223328, "learning_rate": 2e-05, "loss": 0.6557, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7775, "tokens_per_second_per_gpu": 11282.26, "total_tokens": 767758472 }, { "epoch": 0.48612153038259565, "grad_norm": 0.9074311852455139, "learning_rate": 2e-05, "loss": 0.6582, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7776, "tokens_per_second_per_gpu": 9849.13, "total_tokens": 767857759 }, { "epoch": 0.4861840460115029, "grad_norm": 0.8970004916191101, "learning_rate": 2e-05, "loss": 0.6093, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7777, "tokens_per_second_per_gpu": 10867.14, "total_tokens": 767957432 }, { "epoch": 0.4862465616404101, "grad_norm": 0.8548848032951355, "learning_rate": 2e-05, "loss": 0.618, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7778, "tokens_per_second_per_gpu": 10585.99, "total_tokens": 768057783 }, { "epoch": 0.4863090772693173, "grad_norm": 0.8802014589309692, "learning_rate": 2e-05, "loss": 0.6461, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7779, "tokens_per_second_per_gpu": 10161.93, "total_tokens": 768157435 }, { "epoch": 0.48637159289822457, "grad_norm": 0.8842684626579285, "learning_rate": 2e-05, "loss": 0.6684, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7780, "tokens_per_second_per_gpu": 10856.04, "total_tokens": 768257847 }, { "epoch": 0.48643410852713176, "grad_norm": 0.8967797756195068, "learning_rate": 2e-05, "loss": 0.6587, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7781, "tokens_per_second_per_gpu": 10336.39, "total_tokens": 768355316 }, { "epoch": 0.486496624156039, "grad_norm": 0.9106559157371521, "learning_rate": 2e-05, "loss": 0.6916, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7782, "tokens_per_second_per_gpu": 10632.87, "total_tokens": 768452144 }, { "epoch": 0.48655913978494625, "grad_norm": 0.8833937048912048, "learning_rate": 2e-05, "loss": 0.6869, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7783, "tokens_per_second_per_gpu": 11137.21, "total_tokens": 768556054 }, { "epoch": 0.48662165541385344, "grad_norm": 0.921855628490448, "learning_rate": 2e-05, "loss": 0.7047, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7784, "tokens_per_second_per_gpu": 10801.98, "total_tokens": 768656345 }, { "epoch": 0.4866841710427607, "grad_norm": 0.8781829476356506, "learning_rate": 2e-05, "loss": 0.6531, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7785, "tokens_per_second_per_gpu": 10500.54, "total_tokens": 768755376 }, { "epoch": 0.4867466866716679, "grad_norm": 0.9137756824493408, "learning_rate": 2e-05, "loss": 0.6382, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7786, "tokens_per_second_per_gpu": 10600.19, "total_tokens": 768859213 }, { "epoch": 0.4868092023005751, "grad_norm": 0.8873723745346069, "learning_rate": 2e-05, "loss": 0.6083, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7787, "tokens_per_second_per_gpu": 10150.95, "total_tokens": 768955148 }, { "epoch": 0.48687171792948236, "grad_norm": 0.8825017213821411, "learning_rate": 2e-05, "loss": 0.6258, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7788, "tokens_per_second_per_gpu": 10112.28, "total_tokens": 769053993 }, { "epoch": 0.4869342335583896, "grad_norm": 1.0235285758972168, "learning_rate": 2e-05, "loss": 0.662, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7789, "tokens_per_second_per_gpu": 11343.66, "total_tokens": 769152323 }, { "epoch": 0.48699674918729685, "grad_norm": 0.9063613414764404, "learning_rate": 2e-05, "loss": 0.6399, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7790, "tokens_per_second_per_gpu": 10345.69, "total_tokens": 769251619 }, { "epoch": 0.48705926481620404, "grad_norm": 0.9265342354774475, "learning_rate": 2e-05, "loss": 0.6587, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7791, "tokens_per_second_per_gpu": 11290.79, "total_tokens": 769355593 }, { "epoch": 0.4871217804451113, "grad_norm": 0.9467138648033142, "learning_rate": 2e-05, "loss": 0.6168, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7792, "tokens_per_second_per_gpu": 9673.14, "total_tokens": 769451994 }, { "epoch": 0.48718429607401853, "grad_norm": 0.892070472240448, "learning_rate": 2e-05, "loss": 0.612, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7793, "tokens_per_second_per_gpu": 9351.61, "total_tokens": 769545910 }, { "epoch": 0.4872468117029257, "grad_norm": 0.8996099829673767, "learning_rate": 2e-05, "loss": 0.6677, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7794, "tokens_per_second_per_gpu": 10236.79, "total_tokens": 769644499 }, { "epoch": 0.48730932733183296, "grad_norm": 0.8793346881866455, "learning_rate": 2e-05, "loss": 0.6393, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7795, "tokens_per_second_per_gpu": 11155.77, "total_tokens": 769745761 }, { "epoch": 0.4873718429607402, "grad_norm": 0.9015783667564392, "learning_rate": 2e-05, "loss": 0.6354, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7796, "tokens_per_second_per_gpu": 10330.7, "total_tokens": 769841425 }, { "epoch": 0.4874343585896474, "grad_norm": 0.9032612442970276, "learning_rate": 2e-05, "loss": 0.6309, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7797, "tokens_per_second_per_gpu": 11177.01, "total_tokens": 769942793 }, { "epoch": 0.48749687421855464, "grad_norm": 0.8894422054290771, "learning_rate": 2e-05, "loss": 0.6073, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7798, "tokens_per_second_per_gpu": 10381.96, "total_tokens": 770038257 }, { "epoch": 0.4875593898474619, "grad_norm": 0.8735671043395996, "learning_rate": 2e-05, "loss": 0.6646, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7799, "tokens_per_second_per_gpu": 11030.33, "total_tokens": 770139984 }, { "epoch": 0.4876219054763691, "grad_norm": 0.8766728043556213, "learning_rate": 2e-05, "loss": 0.6604, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7800, "tokens_per_second_per_gpu": 10853.61, "total_tokens": 770241650 }, { "epoch": 0.4876844211052763, "grad_norm": 0.8900845646858215, "learning_rate": 2e-05, "loss": 0.6525, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7801, "tokens_per_second_per_gpu": 11217.39, "total_tokens": 770343531 }, { "epoch": 0.48774693673418357, "grad_norm": 0.8950343132019043, "learning_rate": 2e-05, "loss": 0.6327, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7802, "tokens_per_second_per_gpu": 11282.92, "total_tokens": 770445564 }, { "epoch": 0.48780945236309076, "grad_norm": 1.0143916606903076, "learning_rate": 2e-05, "loss": 0.6762, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7803, "tokens_per_second_per_gpu": 9924.91, "total_tokens": 770543274 }, { "epoch": 0.487871967991998, "grad_norm": 0.9406089186668396, "learning_rate": 2e-05, "loss": 0.6694, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7804, "tokens_per_second_per_gpu": 10827.36, "total_tokens": 770641888 }, { "epoch": 0.48793448362090525, "grad_norm": 0.9224584698677063, "learning_rate": 2e-05, "loss": 0.6835, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7805, "tokens_per_second_per_gpu": 10045.14, "total_tokens": 770738049 }, { "epoch": 0.48799699924981244, "grad_norm": 0.9129244089126587, "learning_rate": 2e-05, "loss": 0.6339, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7806, "tokens_per_second_per_gpu": 10728.72, "total_tokens": 770835213 }, { "epoch": 0.4880595148787197, "grad_norm": 0.9095990061759949, "learning_rate": 2e-05, "loss": 0.6655, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7807, "tokens_per_second_per_gpu": 10856.25, "total_tokens": 770933498 }, { "epoch": 0.4881220305076269, "grad_norm": 0.8748508095741272, "learning_rate": 2e-05, "loss": 0.6476, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7808, "tokens_per_second_per_gpu": 10719.31, "total_tokens": 771032192 }, { "epoch": 0.4881845461365341, "grad_norm": 0.9275594353675842, "learning_rate": 2e-05, "loss": 0.6596, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7809, "tokens_per_second_per_gpu": 9873.66, "total_tokens": 771128720 }, { "epoch": 0.48824706176544136, "grad_norm": 0.8854953646659851, "learning_rate": 2e-05, "loss": 0.6744, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7810, "tokens_per_second_per_gpu": 11211.33, "total_tokens": 771228429 }, { "epoch": 0.4883095773943486, "grad_norm": 0.9025653600692749, "learning_rate": 2e-05, "loss": 0.6564, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7811, "tokens_per_second_per_gpu": 10674.41, "total_tokens": 771329730 }, { "epoch": 0.4883720930232558, "grad_norm": 0.8873912692070007, "learning_rate": 2e-05, "loss": 0.6331, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7812, "tokens_per_second_per_gpu": 10382.76, "total_tokens": 771431192 }, { "epoch": 0.48843460865216304, "grad_norm": 0.9551230669021606, "learning_rate": 2e-05, "loss": 0.6045, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7813, "tokens_per_second_per_gpu": 10522.09, "total_tokens": 771527999 }, { "epoch": 0.4884971242810703, "grad_norm": 0.8862378597259521, "learning_rate": 2e-05, "loss": 0.6558, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7814, "tokens_per_second_per_gpu": 10585.8, "total_tokens": 771632230 }, { "epoch": 0.4885596399099775, "grad_norm": 0.9013447761535645, "learning_rate": 2e-05, "loss": 0.6761, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7815, "tokens_per_second_per_gpu": 10936.61, "total_tokens": 771737975 }, { "epoch": 0.4886221555388847, "grad_norm": 0.8609643578529358, "learning_rate": 2e-05, "loss": 0.6301, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7816, "tokens_per_second_per_gpu": 10791.61, "total_tokens": 771840752 }, { "epoch": 0.48868467116779196, "grad_norm": 0.9154318571090698, "learning_rate": 2e-05, "loss": 0.6299, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7817, "tokens_per_second_per_gpu": 10706.1, "total_tokens": 771939533 }, { "epoch": 0.48874718679669915, "grad_norm": 0.914037823677063, "learning_rate": 2e-05, "loss": 0.628, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7818, "tokens_per_second_per_gpu": 10340.38, "total_tokens": 772033006 }, { "epoch": 0.4888097024256064, "grad_norm": 0.8903025388717651, "learning_rate": 2e-05, "loss": 0.6842, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7819, "tokens_per_second_per_gpu": 10919.05, "total_tokens": 772136353 }, { "epoch": 0.48887221805451364, "grad_norm": 0.907414436340332, "learning_rate": 2e-05, "loss": 0.6361, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7820, "tokens_per_second_per_gpu": 10971.09, "total_tokens": 772229757 }, { "epoch": 0.48893473368342083, "grad_norm": 0.8967357873916626, "learning_rate": 2e-05, "loss": 0.5978, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7821, "tokens_per_second_per_gpu": 9557.06, "total_tokens": 772326525 }, { "epoch": 0.4889972493123281, "grad_norm": 0.8833682537078857, "learning_rate": 2e-05, "loss": 0.6303, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7822, "tokens_per_second_per_gpu": 10240.33, "total_tokens": 772426330 }, { "epoch": 0.4890597649412353, "grad_norm": 0.8769385814666748, "learning_rate": 2e-05, "loss": 0.6384, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7823, "tokens_per_second_per_gpu": 10541.39, "total_tokens": 772527450 }, { "epoch": 0.4891222805701425, "grad_norm": 0.8907370567321777, "learning_rate": 2e-05, "loss": 0.6493, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7824, "tokens_per_second_per_gpu": 10413.71, "total_tokens": 772626794 }, { "epoch": 0.48918479619904975, "grad_norm": 0.8842390775680542, "learning_rate": 2e-05, "loss": 0.6263, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7825, "tokens_per_second_per_gpu": 10571.21, "total_tokens": 772726183 }, { "epoch": 0.489247311827957, "grad_norm": 0.8837008476257324, "learning_rate": 2e-05, "loss": 0.6407, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7826, "tokens_per_second_per_gpu": 10469.16, "total_tokens": 772829092 }, { "epoch": 0.48930982745686424, "grad_norm": 0.8904192447662354, "learning_rate": 2e-05, "loss": 0.6784, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7827, "tokens_per_second_per_gpu": 11164.1, "total_tokens": 772931874 }, { "epoch": 0.48937234308577143, "grad_norm": 0.8973207473754883, "learning_rate": 2e-05, "loss": 0.6483, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7828, "tokens_per_second_per_gpu": 10399.69, "total_tokens": 773033688 }, { "epoch": 0.4894348587146787, "grad_norm": 0.8768648505210876, "learning_rate": 2e-05, "loss": 0.6812, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7829, "tokens_per_second_per_gpu": 10647.06, "total_tokens": 773138968 }, { "epoch": 0.4894973743435859, "grad_norm": 0.8839282989501953, "learning_rate": 2e-05, "loss": 0.6306, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7830, "tokens_per_second_per_gpu": 10349.82, "total_tokens": 773236510 }, { "epoch": 0.4895598899724931, "grad_norm": 0.8481414914131165, "learning_rate": 2e-05, "loss": 0.6625, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7831, "tokens_per_second_per_gpu": 10866.81, "total_tokens": 773337432 }, { "epoch": 0.48962240560140036, "grad_norm": 0.9055315256118774, "learning_rate": 2e-05, "loss": 0.6521, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7832, "tokens_per_second_per_gpu": 10213.56, "total_tokens": 773437004 }, { "epoch": 0.4896849212303076, "grad_norm": 0.9133355021476746, "learning_rate": 2e-05, "loss": 0.6358, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7833, "tokens_per_second_per_gpu": 10379.72, "total_tokens": 773535406 }, { "epoch": 0.4897474368592148, "grad_norm": 0.9188968539237976, "learning_rate": 2e-05, "loss": 0.6862, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7834, "tokens_per_second_per_gpu": 10568.62, "total_tokens": 773633767 }, { "epoch": 0.48980995248812204, "grad_norm": 0.8921571969985962, "learning_rate": 2e-05, "loss": 0.6456, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7835, "tokens_per_second_per_gpu": 10619.58, "total_tokens": 773730003 }, { "epoch": 0.4898724681170293, "grad_norm": 0.8972142338752747, "learning_rate": 2e-05, "loss": 0.6648, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7836, "tokens_per_second_per_gpu": 11704.6, "total_tokens": 773834268 }, { "epoch": 0.48993498374593647, "grad_norm": 0.8889122009277344, "learning_rate": 2e-05, "loss": 0.6673, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7837, "tokens_per_second_per_gpu": 10548.44, "total_tokens": 773936990 }, { "epoch": 0.4899974993748437, "grad_norm": 0.9250960350036621, "learning_rate": 2e-05, "loss": 0.695, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7838, "tokens_per_second_per_gpu": 10481.48, "total_tokens": 774038394 }, { "epoch": 0.49006001500375096, "grad_norm": 0.8820453882217407, "learning_rate": 2e-05, "loss": 0.5943, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7839, "tokens_per_second_per_gpu": 14475.05, "total_tokens": 774133977 }, { "epoch": 0.49012253063265815, "grad_norm": 0.9335874915122986, "learning_rate": 2e-05, "loss": 0.6441, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7840, "tokens_per_second_per_gpu": 10477.1, "total_tokens": 774231722 }, { "epoch": 0.4901850462615654, "grad_norm": 0.8844954371452332, "learning_rate": 2e-05, "loss": 0.6566, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7841, "tokens_per_second_per_gpu": 10609.73, "total_tokens": 774329190 }, { "epoch": 0.49024756189047264, "grad_norm": 0.8680281043052673, "learning_rate": 2e-05, "loss": 0.6379, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7842, "tokens_per_second_per_gpu": 11110.75, "total_tokens": 774429832 }, { "epoch": 0.4903100775193798, "grad_norm": 0.9093083739280701, "learning_rate": 2e-05, "loss": 0.6427, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7843, "tokens_per_second_per_gpu": 9750.81, "total_tokens": 774523927 }, { "epoch": 0.4903725931482871, "grad_norm": 0.90659499168396, "learning_rate": 2e-05, "loss": 0.6757, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7844, "tokens_per_second_per_gpu": 10719.58, "total_tokens": 774625636 }, { "epoch": 0.4904351087771943, "grad_norm": 0.9240081310272217, "learning_rate": 2e-05, "loss": 0.6197, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7845, "tokens_per_second_per_gpu": 9556.96, "total_tokens": 774718305 }, { "epoch": 0.4904976244061015, "grad_norm": 0.8926538825035095, "learning_rate": 2e-05, "loss": 0.6381, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7846, "tokens_per_second_per_gpu": 10611.5, "total_tokens": 774814656 }, { "epoch": 0.49056014003500875, "grad_norm": 0.8688790798187256, "learning_rate": 2e-05, "loss": 0.6173, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7847, "tokens_per_second_per_gpu": 10281.88, "total_tokens": 774913305 }, { "epoch": 0.490622655663916, "grad_norm": 0.9302972555160522, "learning_rate": 2e-05, "loss": 0.661, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7848, "tokens_per_second_per_gpu": 11181.97, "total_tokens": 775013701 }, { "epoch": 0.4906851712928232, "grad_norm": 0.8693909645080566, "learning_rate": 2e-05, "loss": 0.6258, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7849, "tokens_per_second_per_gpu": 10539.3, "total_tokens": 775113157 }, { "epoch": 0.49074768692173043, "grad_norm": 0.9000561833381653, "learning_rate": 2e-05, "loss": 0.6458, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7850, "tokens_per_second_per_gpu": 10522.49, "total_tokens": 775205800 }, { "epoch": 0.4908102025506377, "grad_norm": 0.9417040944099426, "learning_rate": 2e-05, "loss": 0.6808, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7851, "tokens_per_second_per_gpu": 10745.98, "total_tokens": 775303061 }, { "epoch": 0.49087271817954486, "grad_norm": 0.8692560791969299, "learning_rate": 2e-05, "loss": 0.6288, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7852, "tokens_per_second_per_gpu": 10059.69, "total_tokens": 775400684 }, { "epoch": 0.4909352338084521, "grad_norm": 0.9055156111717224, "learning_rate": 2e-05, "loss": 0.6483, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7853, "tokens_per_second_per_gpu": 11039.8, "total_tokens": 775503105 }, { "epoch": 0.49099774943735935, "grad_norm": 0.9115765690803528, "learning_rate": 2e-05, "loss": 0.6022, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7854, "tokens_per_second_per_gpu": 10137.08, "total_tokens": 775595034 }, { "epoch": 0.49106026506626654, "grad_norm": 0.9220922589302063, "learning_rate": 2e-05, "loss": 0.6735, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7855, "tokens_per_second_per_gpu": 10337.83, "total_tokens": 775693811 }, { "epoch": 0.4911227806951738, "grad_norm": 0.881527304649353, "learning_rate": 2e-05, "loss": 0.6632, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7856, "tokens_per_second_per_gpu": 11257.89, "total_tokens": 775795760 }, { "epoch": 0.49118529632408103, "grad_norm": 0.890766978263855, "learning_rate": 2e-05, "loss": 0.6023, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7857, "tokens_per_second_per_gpu": 10353.5, "total_tokens": 775892446 }, { "epoch": 0.4912478119529882, "grad_norm": 0.9021545052528381, "learning_rate": 2e-05, "loss": 0.6278, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7858, "tokens_per_second_per_gpu": 10407.74, "total_tokens": 775989319 }, { "epoch": 0.49131032758189547, "grad_norm": 0.8919277787208557, "learning_rate": 2e-05, "loss": 0.6562, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7859, "tokens_per_second_per_gpu": 10466.9, "total_tokens": 776087864 }, { "epoch": 0.4913728432108027, "grad_norm": 0.8798649907112122, "learning_rate": 2e-05, "loss": 0.6285, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7860, "tokens_per_second_per_gpu": 10079.08, "total_tokens": 776188652 }, { "epoch": 0.4914353588397099, "grad_norm": 0.8629424571990967, "learning_rate": 2e-05, "loss": 0.6023, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7861, "tokens_per_second_per_gpu": 9709.55, "total_tokens": 776286788 }, { "epoch": 0.49149787446861715, "grad_norm": 0.8887308835983276, "learning_rate": 2e-05, "loss": 0.6093, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7862, "tokens_per_second_per_gpu": 10341.22, "total_tokens": 776380968 }, { "epoch": 0.4915603900975244, "grad_norm": 0.9168233871459961, "learning_rate": 2e-05, "loss": 0.6572, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7863, "tokens_per_second_per_gpu": 10276.4, "total_tokens": 776479387 }, { "epoch": 0.4916229057264316, "grad_norm": 0.9069377779960632, "learning_rate": 2e-05, "loss": 0.663, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7864, "tokens_per_second_per_gpu": 11262.47, "total_tokens": 776580220 }, { "epoch": 0.4916854213553388, "grad_norm": 0.9085519313812256, "learning_rate": 2e-05, "loss": 0.6628, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7865, "tokens_per_second_per_gpu": 10726.71, "total_tokens": 776675558 }, { "epoch": 0.49174793698424607, "grad_norm": 0.9404191970825195, "learning_rate": 2e-05, "loss": 0.6751, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7866, "tokens_per_second_per_gpu": 9671.36, "total_tokens": 776768321 }, { "epoch": 0.4918104526131533, "grad_norm": 0.9085592031478882, "learning_rate": 2e-05, "loss": 0.598, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7867, "tokens_per_second_per_gpu": 10128.45, "total_tokens": 776866765 }, { "epoch": 0.4918729682420605, "grad_norm": 0.9708207249641418, "learning_rate": 2e-05, "loss": 0.6232, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7868, "tokens_per_second_per_gpu": 10537.98, "total_tokens": 776965293 }, { "epoch": 0.49193548387096775, "grad_norm": 0.8883028030395508, "learning_rate": 2e-05, "loss": 0.6099, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7869, "tokens_per_second_per_gpu": 9768.43, "total_tokens": 777060532 }, { "epoch": 0.491997999499875, "grad_norm": 0.934012770652771, "learning_rate": 2e-05, "loss": 0.6637, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7870, "tokens_per_second_per_gpu": 10396.48, "total_tokens": 777160174 }, { "epoch": 0.4920605151287822, "grad_norm": 0.9283170104026794, "learning_rate": 2e-05, "loss": 0.6717, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7871, "tokens_per_second_per_gpu": 11125.48, "total_tokens": 777262683 }, { "epoch": 0.49212303075768943, "grad_norm": 0.8946043252944946, "learning_rate": 2e-05, "loss": 0.6559, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7872, "tokens_per_second_per_gpu": 11033.88, "total_tokens": 777365877 }, { "epoch": 0.4921855463865967, "grad_norm": 0.8521011471748352, "learning_rate": 2e-05, "loss": 0.6729, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7873, "tokens_per_second_per_gpu": 11088.9, "total_tokens": 777469623 }, { "epoch": 0.49224806201550386, "grad_norm": 0.8733890056610107, "learning_rate": 2e-05, "loss": 0.6682, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7874, "tokens_per_second_per_gpu": 11181.68, "total_tokens": 777571622 }, { "epoch": 0.4923105776444111, "grad_norm": 0.9399077296257019, "learning_rate": 2e-05, "loss": 0.6393, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7875, "tokens_per_second_per_gpu": 10816.25, "total_tokens": 777665731 }, { "epoch": 0.49237309327331835, "grad_norm": 0.8705604076385498, "learning_rate": 2e-05, "loss": 0.6364, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7876, "tokens_per_second_per_gpu": 10368.19, "total_tokens": 777764685 }, { "epoch": 0.49243560890222554, "grad_norm": 0.904815137386322, "learning_rate": 2e-05, "loss": 0.671, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7877, "tokens_per_second_per_gpu": 10268.01, "total_tokens": 777862900 }, { "epoch": 0.4924981245311328, "grad_norm": 0.8760277628898621, "learning_rate": 2e-05, "loss": 0.6593, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7878, "tokens_per_second_per_gpu": 10825.24, "total_tokens": 777965620 }, { "epoch": 0.49256064016004003, "grad_norm": 0.9072049856185913, "learning_rate": 2e-05, "loss": 0.6266, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7879, "tokens_per_second_per_gpu": 9282.92, "total_tokens": 778059538 }, { "epoch": 0.4926231557889472, "grad_norm": 0.9155586361885071, "learning_rate": 2e-05, "loss": 0.6653, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7880, "tokens_per_second_per_gpu": 9995.01, "total_tokens": 778157577 }, { "epoch": 0.49268567141785446, "grad_norm": 0.9011879563331604, "learning_rate": 2e-05, "loss": 0.6748, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7881, "tokens_per_second_per_gpu": 10939.71, "total_tokens": 778261384 }, { "epoch": 0.4927481870467617, "grad_norm": 0.9263227581977844, "learning_rate": 2e-05, "loss": 0.6909, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7882, "tokens_per_second_per_gpu": 10677.63, "total_tokens": 778361580 }, { "epoch": 0.4928107026756689, "grad_norm": 0.883935809135437, "learning_rate": 2e-05, "loss": 0.6711, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7883, "tokens_per_second_per_gpu": 10513.99, "total_tokens": 778460296 }, { "epoch": 0.49287321830457614, "grad_norm": 0.8823403120040894, "learning_rate": 2e-05, "loss": 0.6432, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7884, "tokens_per_second_per_gpu": 10369.7, "total_tokens": 778561181 }, { "epoch": 0.4929357339334834, "grad_norm": 0.9079756140708923, "learning_rate": 2e-05, "loss": 0.6508, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7885, "tokens_per_second_per_gpu": 10394.19, "total_tokens": 778658962 }, { "epoch": 0.4929982495623906, "grad_norm": 0.8904041051864624, "learning_rate": 2e-05, "loss": 0.6373, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7886, "tokens_per_second_per_gpu": 9828.87, "total_tokens": 778757871 }, { "epoch": 0.4930607651912978, "grad_norm": 0.8618553876876831, "learning_rate": 2e-05, "loss": 0.6411, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7887, "tokens_per_second_per_gpu": 10907.78, "total_tokens": 778859795 }, { "epoch": 0.49312328082020507, "grad_norm": 0.8638177514076233, "learning_rate": 2e-05, "loss": 0.6489, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7888, "tokens_per_second_per_gpu": 10934.57, "total_tokens": 778960933 }, { "epoch": 0.49318579644911226, "grad_norm": 0.8726286292076111, "learning_rate": 2e-05, "loss": 0.6214, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7889, "tokens_per_second_per_gpu": 10822.03, "total_tokens": 779061783 }, { "epoch": 0.4932483120780195, "grad_norm": 0.9418856501579285, "learning_rate": 2e-05, "loss": 0.6402, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7890, "tokens_per_second_per_gpu": 11358.11, "total_tokens": 779164403 }, { "epoch": 0.49331082770692675, "grad_norm": 0.8931743502616882, "learning_rate": 2e-05, "loss": 0.6714, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7891, "tokens_per_second_per_gpu": 10138.38, "total_tokens": 779262819 }, { "epoch": 0.49337334333583394, "grad_norm": 0.898102879524231, "learning_rate": 2e-05, "loss": 0.6474, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7892, "tokens_per_second_per_gpu": 10397.63, "total_tokens": 779358124 }, { "epoch": 0.4934358589647412, "grad_norm": 0.9206399917602539, "learning_rate": 2e-05, "loss": 0.5845, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7893, "tokens_per_second_per_gpu": 9625.66, "total_tokens": 779447214 }, { "epoch": 0.4934983745936484, "grad_norm": 0.9038742184638977, "learning_rate": 2e-05, "loss": 0.6975, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7894, "tokens_per_second_per_gpu": 10527.67, "total_tokens": 779547444 }, { "epoch": 0.4935608902225556, "grad_norm": 0.8970116376876831, "learning_rate": 2e-05, "loss": 0.6624, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7895, "tokens_per_second_per_gpu": 10730.51, "total_tokens": 779647137 }, { "epoch": 0.49362340585146286, "grad_norm": 0.9208768606185913, "learning_rate": 2e-05, "loss": 0.6409, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7896, "tokens_per_second_per_gpu": 10340.27, "total_tokens": 779747379 }, { "epoch": 0.4936859214803701, "grad_norm": 0.9352499842643738, "learning_rate": 2e-05, "loss": 0.6596, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7897, "tokens_per_second_per_gpu": 10543.06, "total_tokens": 779845265 }, { "epoch": 0.4937484371092773, "grad_norm": 0.9037752747535706, "learning_rate": 2e-05, "loss": 0.6299, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7898, "tokens_per_second_per_gpu": 10493.07, "total_tokens": 779944450 }, { "epoch": 0.49381095273818454, "grad_norm": 0.851396918296814, "learning_rate": 2e-05, "loss": 0.6702, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7899, "tokens_per_second_per_gpu": 10732.02, "total_tokens": 780047484 }, { "epoch": 0.4938734683670918, "grad_norm": 0.8667486906051636, "learning_rate": 2e-05, "loss": 0.6624, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7900, "tokens_per_second_per_gpu": 10816.99, "total_tokens": 780147502 }, { "epoch": 0.493935983995999, "grad_norm": 0.8736631274223328, "learning_rate": 2e-05, "loss": 0.6531, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7901, "tokens_per_second_per_gpu": 10173.6, "total_tokens": 780243394 }, { "epoch": 0.4939984996249062, "grad_norm": 0.9122177362442017, "learning_rate": 2e-05, "loss": 0.6091, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7902, "tokens_per_second_per_gpu": 9843.45, "total_tokens": 780339802 }, { "epoch": 0.49406101525381346, "grad_norm": 0.8651015758514404, "learning_rate": 2e-05, "loss": 0.6323, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7903, "tokens_per_second_per_gpu": 10854.32, "total_tokens": 780441165 }, { "epoch": 0.4941235308827207, "grad_norm": 0.8804681897163391, "learning_rate": 2e-05, "loss": 0.63, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7904, "tokens_per_second_per_gpu": 9711.58, "total_tokens": 780535882 }, { "epoch": 0.4941860465116279, "grad_norm": 0.9149863123893738, "learning_rate": 2e-05, "loss": 0.619, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7905, "tokens_per_second_per_gpu": 10428.92, "total_tokens": 780635890 }, { "epoch": 0.49424856214053514, "grad_norm": 0.8687833547592163, "learning_rate": 2e-05, "loss": 0.6308, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7906, "tokens_per_second_per_gpu": 10124.32, "total_tokens": 780733478 }, { "epoch": 0.4943110777694424, "grad_norm": 0.8643891215324402, "learning_rate": 2e-05, "loss": 0.6323, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7907, "tokens_per_second_per_gpu": 10939.87, "total_tokens": 780833248 }, { "epoch": 0.4943735933983496, "grad_norm": 0.8381507992744446, "learning_rate": 2e-05, "loss": 0.6058, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7908, "tokens_per_second_per_gpu": 10532.13, "total_tokens": 780932614 }, { "epoch": 0.4944361090272568, "grad_norm": 0.8916929364204407, "learning_rate": 2e-05, "loss": 0.6677, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7909, "tokens_per_second_per_gpu": 11353.59, "total_tokens": 781034625 }, { "epoch": 0.49449862465616407, "grad_norm": 0.9184936285018921, "learning_rate": 2e-05, "loss": 0.6567, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7910, "tokens_per_second_per_gpu": 10571.81, "total_tokens": 781133734 }, { "epoch": 0.49456114028507125, "grad_norm": 0.8592957854270935, "learning_rate": 2e-05, "loss": 0.6048, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7911, "tokens_per_second_per_gpu": 10516.28, "total_tokens": 781231672 }, { "epoch": 0.4946236559139785, "grad_norm": 0.8913319706916809, "learning_rate": 2e-05, "loss": 0.6836, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7912, "tokens_per_second_per_gpu": 10737.95, "total_tokens": 781335178 }, { "epoch": 0.49468617154288574, "grad_norm": 0.8681648373603821, "learning_rate": 2e-05, "loss": 0.6301, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7913, "tokens_per_second_per_gpu": 10537.34, "total_tokens": 781434301 }, { "epoch": 0.49474868717179293, "grad_norm": 0.9026208519935608, "learning_rate": 2e-05, "loss": 0.629, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7914, "tokens_per_second_per_gpu": 9888.38, "total_tokens": 781528009 }, { "epoch": 0.4948112028007002, "grad_norm": 0.8977193832397461, "learning_rate": 2e-05, "loss": 0.6436, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7915, "tokens_per_second_per_gpu": 10596.89, "total_tokens": 781629817 }, { "epoch": 0.4948737184296074, "grad_norm": 0.9129910469055176, "learning_rate": 2e-05, "loss": 0.6447, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7916, "tokens_per_second_per_gpu": 10820.57, "total_tokens": 781730284 }, { "epoch": 0.4949362340585146, "grad_norm": 0.9438385963439941, "learning_rate": 2e-05, "loss": 0.6484, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7917, "tokens_per_second_per_gpu": 10784.34, "total_tokens": 781823145 }, { "epoch": 0.49499874968742186, "grad_norm": 0.8661107420921326, "learning_rate": 2e-05, "loss": 0.6028, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7918, "tokens_per_second_per_gpu": 10250.06, "total_tokens": 781922241 }, { "epoch": 0.4950612653163291, "grad_norm": 0.9145733714103699, "learning_rate": 2e-05, "loss": 0.653, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7919, "tokens_per_second_per_gpu": 10089.01, "total_tokens": 782019711 }, { "epoch": 0.4951237809452363, "grad_norm": 0.8629605770111084, "learning_rate": 2e-05, "loss": 0.6106, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7920, "tokens_per_second_per_gpu": 10992.67, "total_tokens": 782116101 }, { "epoch": 0.49518629657414354, "grad_norm": 0.8736943006515503, "learning_rate": 2e-05, "loss": 0.6848, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7921, "tokens_per_second_per_gpu": 10897.67, "total_tokens": 782221884 }, { "epoch": 0.4952488122030508, "grad_norm": 0.9002954363822937, "learning_rate": 2e-05, "loss": 0.7062, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7922, "tokens_per_second_per_gpu": 10729.35, "total_tokens": 782322566 }, { "epoch": 0.49531132783195797, "grad_norm": 0.8871166110038757, "learning_rate": 2e-05, "loss": 0.6549, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7923, "tokens_per_second_per_gpu": 11058.81, "total_tokens": 782426480 }, { "epoch": 0.4953738434608652, "grad_norm": 0.9259904026985168, "learning_rate": 2e-05, "loss": 0.6814, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7924, "tokens_per_second_per_gpu": 10424.07, "total_tokens": 782524230 }, { "epoch": 0.49543635908977246, "grad_norm": 0.8904109597206116, "learning_rate": 2e-05, "loss": 0.6619, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7925, "tokens_per_second_per_gpu": 10529.83, "total_tokens": 782625806 }, { "epoch": 0.49549887471867965, "grad_norm": 0.9089903235435486, "learning_rate": 2e-05, "loss": 0.6562, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7926, "tokens_per_second_per_gpu": 11100.57, "total_tokens": 782726973 }, { "epoch": 0.4955613903475869, "grad_norm": 0.8792800903320312, "learning_rate": 2e-05, "loss": 0.6272, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7927, "tokens_per_second_per_gpu": 10422.25, "total_tokens": 782824514 }, { "epoch": 0.49562390597649414, "grad_norm": 0.914936900138855, "learning_rate": 2e-05, "loss": 0.6573, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7928, "tokens_per_second_per_gpu": 10505.68, "total_tokens": 782922962 }, { "epoch": 0.49568642160540133, "grad_norm": 0.9071531295776367, "learning_rate": 2e-05, "loss": 0.6457, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7929, "tokens_per_second_per_gpu": 10684.28, "total_tokens": 783021373 }, { "epoch": 0.4957489372343086, "grad_norm": 0.8893082141876221, "learning_rate": 2e-05, "loss": 0.6496, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7930, "tokens_per_second_per_gpu": 10941.92, "total_tokens": 783121886 }, { "epoch": 0.4958114528632158, "grad_norm": 0.8731276392936707, "learning_rate": 2e-05, "loss": 0.661, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7931, "tokens_per_second_per_gpu": 10144.27, "total_tokens": 783223010 }, { "epoch": 0.495873968492123, "grad_norm": 0.8895136713981628, "learning_rate": 2e-05, "loss": 0.6334, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7932, "tokens_per_second_per_gpu": 9853.39, "total_tokens": 783320190 }, { "epoch": 0.49593648412103025, "grad_norm": 0.8881374597549438, "learning_rate": 2e-05, "loss": 0.5718, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7933, "tokens_per_second_per_gpu": 10248.92, "total_tokens": 783416305 }, { "epoch": 0.4959989997499375, "grad_norm": 0.9211751818656921, "learning_rate": 2e-05, "loss": 0.6327, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7934, "tokens_per_second_per_gpu": 11300.1, "total_tokens": 783517331 }, { "epoch": 0.4960615153788447, "grad_norm": 0.89797043800354, "learning_rate": 2e-05, "loss": 0.6312, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7935, "tokens_per_second_per_gpu": 10016.37, "total_tokens": 783615543 }, { "epoch": 0.49612403100775193, "grad_norm": 0.8988296389579773, "learning_rate": 2e-05, "loss": 0.6544, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7936, "tokens_per_second_per_gpu": 11140.53, "total_tokens": 783716501 }, { "epoch": 0.4961865466366592, "grad_norm": 0.920091986656189, "learning_rate": 2e-05, "loss": 0.6711, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7937, "tokens_per_second_per_gpu": 10394.56, "total_tokens": 783815605 }, { "epoch": 0.49624906226556637, "grad_norm": 0.9000912308692932, "learning_rate": 2e-05, "loss": 0.6329, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7938, "tokens_per_second_per_gpu": 10571.28, "total_tokens": 783915422 }, { "epoch": 0.4963115778944736, "grad_norm": 0.8714137077331543, "learning_rate": 2e-05, "loss": 0.6381, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7939, "tokens_per_second_per_gpu": 11297.08, "total_tokens": 784017587 }, { "epoch": 0.49637409352338085, "grad_norm": 0.8720464706420898, "learning_rate": 2e-05, "loss": 0.6326, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7940, "tokens_per_second_per_gpu": 10425.91, "total_tokens": 784120005 }, { "epoch": 0.4964366091522881, "grad_norm": 0.8867588043212891, "learning_rate": 2e-05, "loss": 0.6382, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7941, "tokens_per_second_per_gpu": 10724.54, "total_tokens": 784219868 }, { "epoch": 0.4964991247811953, "grad_norm": 0.8573328256607056, "learning_rate": 2e-05, "loss": 0.6319, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7942, "tokens_per_second_per_gpu": 10338.34, "total_tokens": 784322478 }, { "epoch": 0.49656164041010253, "grad_norm": 0.8722565174102783, "learning_rate": 2e-05, "loss": 0.6377, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7943, "tokens_per_second_per_gpu": 10520.52, "total_tokens": 784421503 }, { "epoch": 0.4966241560390098, "grad_norm": 0.910825252532959, "learning_rate": 2e-05, "loss": 0.6558, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7944, "tokens_per_second_per_gpu": 10118.96, "total_tokens": 784523028 }, { "epoch": 0.49668667166791697, "grad_norm": 0.8965118527412415, "learning_rate": 2e-05, "loss": 0.6604, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7945, "tokens_per_second_per_gpu": 11110.38, "total_tokens": 784623988 }, { "epoch": 0.4967491872968242, "grad_norm": 0.9009377956390381, "learning_rate": 2e-05, "loss": 0.6797, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7946, "tokens_per_second_per_gpu": 10649.77, "total_tokens": 784723874 }, { "epoch": 0.49681170292573146, "grad_norm": 0.8929465413093567, "learning_rate": 2e-05, "loss": 0.6673, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7947, "tokens_per_second_per_gpu": 10943.53, "total_tokens": 784824869 }, { "epoch": 0.49687421855463865, "grad_norm": 0.8794127106666565, "learning_rate": 2e-05, "loss": 0.6499, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7948, "tokens_per_second_per_gpu": 10836.46, "total_tokens": 784924382 }, { "epoch": 0.4969367341835459, "grad_norm": 0.8839834332466125, "learning_rate": 2e-05, "loss": 0.6243, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7949, "tokens_per_second_per_gpu": 10144.21, "total_tokens": 785023351 }, { "epoch": 0.49699924981245314, "grad_norm": 0.9004620909690857, "learning_rate": 2e-05, "loss": 0.6347, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7950, "tokens_per_second_per_gpu": 10179.68, "total_tokens": 785120538 }, { "epoch": 0.4970617654413603, "grad_norm": 0.9722713232040405, "learning_rate": 2e-05, "loss": 0.6684, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7951, "tokens_per_second_per_gpu": 10912.54, "total_tokens": 785218679 }, { "epoch": 0.49712428107026757, "grad_norm": 0.9139745235443115, "learning_rate": 2e-05, "loss": 0.6503, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7952, "tokens_per_second_per_gpu": 10328.38, "total_tokens": 785315184 }, { "epoch": 0.4971867966991748, "grad_norm": 0.8776975274085999, "learning_rate": 2e-05, "loss": 0.6162, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7953, "tokens_per_second_per_gpu": 10576.99, "total_tokens": 785414764 }, { "epoch": 0.497249312328082, "grad_norm": 0.895776629447937, "learning_rate": 2e-05, "loss": 0.6221, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7954, "tokens_per_second_per_gpu": 10559.36, "total_tokens": 785511372 }, { "epoch": 0.49731182795698925, "grad_norm": 0.8970997929573059, "learning_rate": 2e-05, "loss": 0.6557, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7955, "tokens_per_second_per_gpu": 10829.41, "total_tokens": 785613076 }, { "epoch": 0.4973743435858965, "grad_norm": 0.9448768496513367, "learning_rate": 2e-05, "loss": 0.6944, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7956, "tokens_per_second_per_gpu": 10703.75, "total_tokens": 785711729 }, { "epoch": 0.4974368592148037, "grad_norm": 0.9164490103721619, "learning_rate": 2e-05, "loss": 0.6728, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7957, "tokens_per_second_per_gpu": 10376.94, "total_tokens": 785811091 }, { "epoch": 0.49749937484371093, "grad_norm": 0.9802147746086121, "learning_rate": 2e-05, "loss": 0.6854, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7958, "tokens_per_second_per_gpu": 10866.15, "total_tokens": 785911136 }, { "epoch": 0.4975618904726182, "grad_norm": 0.8973126411437988, "learning_rate": 2e-05, "loss": 0.6502, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7959, "tokens_per_second_per_gpu": 11349.32, "total_tokens": 786010333 }, { "epoch": 0.49762440610152536, "grad_norm": 0.9115820527076721, "learning_rate": 2e-05, "loss": 0.674, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7960, "tokens_per_second_per_gpu": 10954.94, "total_tokens": 786114329 }, { "epoch": 0.4976869217304326, "grad_norm": 0.8892711400985718, "learning_rate": 2e-05, "loss": 0.6597, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7961, "tokens_per_second_per_gpu": 10884.63, "total_tokens": 786215806 }, { "epoch": 0.49774943735933985, "grad_norm": 0.8772761821746826, "learning_rate": 2e-05, "loss": 0.6151, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7962, "tokens_per_second_per_gpu": 10192.47, "total_tokens": 786315353 }, { "epoch": 0.49781195298824704, "grad_norm": 0.9108219742774963, "learning_rate": 2e-05, "loss": 0.637, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7963, "tokens_per_second_per_gpu": 9617.24, "total_tokens": 786405865 }, { "epoch": 0.4978744686171543, "grad_norm": 0.8951548933982849, "learning_rate": 2e-05, "loss": 0.6235, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7964, "tokens_per_second_per_gpu": 9924.53, "total_tokens": 786501327 }, { "epoch": 0.49793698424606153, "grad_norm": 0.8903903961181641, "learning_rate": 2e-05, "loss": 0.6382, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7965, "tokens_per_second_per_gpu": 10424.09, "total_tokens": 786597952 }, { "epoch": 0.4979994998749687, "grad_norm": 0.8830782771110535, "learning_rate": 2e-05, "loss": 0.6427, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7966, "tokens_per_second_per_gpu": 10286.82, "total_tokens": 786694772 }, { "epoch": 0.49806201550387597, "grad_norm": 0.8765531182289124, "learning_rate": 2e-05, "loss": 0.6397, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7967, "tokens_per_second_per_gpu": 10224.79, "total_tokens": 786792540 }, { "epoch": 0.4981245311327832, "grad_norm": 0.9251441955566406, "learning_rate": 2e-05, "loss": 0.6128, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7968, "tokens_per_second_per_gpu": 9370.97, "total_tokens": 786884598 }, { "epoch": 0.4981870467616904, "grad_norm": 0.9000948667526245, "learning_rate": 2e-05, "loss": 0.6393, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7969, "tokens_per_second_per_gpu": 10288.0, "total_tokens": 786980660 }, { "epoch": 0.49824956239059764, "grad_norm": 0.9058273434638977, "learning_rate": 2e-05, "loss": 0.6331, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7970, "tokens_per_second_per_gpu": 10856.08, "total_tokens": 787080317 }, { "epoch": 0.4983120780195049, "grad_norm": 0.8515642285346985, "learning_rate": 2e-05, "loss": 0.6241, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7971, "tokens_per_second_per_gpu": 10675.01, "total_tokens": 787181749 }, { "epoch": 0.4983745936484121, "grad_norm": 0.9002155065536499, "learning_rate": 2e-05, "loss": 0.6627, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7972, "tokens_per_second_per_gpu": 10947.3, "total_tokens": 787275866 }, { "epoch": 0.4984371092773193, "grad_norm": 0.8876636028289795, "learning_rate": 2e-05, "loss": 0.6191, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7973, "tokens_per_second_per_gpu": 9737.75, "total_tokens": 787367201 }, { "epoch": 0.49849962490622657, "grad_norm": 0.9781882166862488, "learning_rate": 2e-05, "loss": 0.6903, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7974, "tokens_per_second_per_gpu": 9959.55, "total_tokens": 787464618 }, { "epoch": 0.49856214053513376, "grad_norm": 0.8908398747444153, "learning_rate": 2e-05, "loss": 0.6201, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7975, "tokens_per_second_per_gpu": 10004.43, "total_tokens": 787559941 }, { "epoch": 0.498624656164041, "grad_norm": 0.9012528657913208, "learning_rate": 2e-05, "loss": 0.645, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7976, "tokens_per_second_per_gpu": 10900.48, "total_tokens": 787660761 }, { "epoch": 0.49868717179294825, "grad_norm": 0.9223092794418335, "learning_rate": 2e-05, "loss": 0.6389, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7977, "tokens_per_second_per_gpu": 10454.2, "total_tokens": 787757894 }, { "epoch": 0.49874968742185544, "grad_norm": 0.8316749930381775, "learning_rate": 2e-05, "loss": 0.6204, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7978, "tokens_per_second_per_gpu": 11244.19, "total_tokens": 787860353 }, { "epoch": 0.4988122030507627, "grad_norm": 0.9062113761901855, "learning_rate": 2e-05, "loss": 0.6282, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7979, "tokens_per_second_per_gpu": 10365.74, "total_tokens": 787961081 }, { "epoch": 0.4988747186796699, "grad_norm": 0.8729040622711182, "learning_rate": 2e-05, "loss": 0.6201, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7980, "tokens_per_second_per_gpu": 10435.54, "total_tokens": 788061846 }, { "epoch": 0.49893723430857717, "grad_norm": 0.9269327521324158, "learning_rate": 2e-05, "loss": 0.6874, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7981, "tokens_per_second_per_gpu": 10828.83, "total_tokens": 788161809 }, { "epoch": 0.49899974993748436, "grad_norm": 0.9064761996269226, "learning_rate": 2e-05, "loss": 0.6423, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7982, "tokens_per_second_per_gpu": 10158.32, "total_tokens": 788257943 }, { "epoch": 0.4990622655663916, "grad_norm": 0.8658515214920044, "learning_rate": 2e-05, "loss": 0.6585, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7983, "tokens_per_second_per_gpu": 11256.5, "total_tokens": 788358748 }, { "epoch": 0.49912478119529885, "grad_norm": 0.8813115358352661, "learning_rate": 2e-05, "loss": 0.6253, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7984, "tokens_per_second_per_gpu": 10762.73, "total_tokens": 788458968 }, { "epoch": 0.49918729682420604, "grad_norm": 0.8759530782699585, "learning_rate": 2e-05, "loss": 0.6399, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7985, "tokens_per_second_per_gpu": 10601.53, "total_tokens": 788559906 }, { "epoch": 0.4992498124531133, "grad_norm": 0.869143009185791, "learning_rate": 2e-05, "loss": 0.6434, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7986, "tokens_per_second_per_gpu": 10161.85, "total_tokens": 788660128 }, { "epoch": 0.49931232808202053, "grad_norm": 0.8823542594909668, "learning_rate": 2e-05, "loss": 0.6379, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7987, "tokens_per_second_per_gpu": 9972.88, "total_tokens": 788762194 }, { "epoch": 0.4993748437109277, "grad_norm": 0.8877498507499695, "learning_rate": 2e-05, "loss": 0.6036, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7988, "tokens_per_second_per_gpu": 10416.72, "total_tokens": 788852922 }, { "epoch": 0.49943735933983496, "grad_norm": 0.891032338142395, "learning_rate": 2e-05, "loss": 0.6251, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7989, "tokens_per_second_per_gpu": 10692.44, "total_tokens": 788950763 }, { "epoch": 0.4994998749687422, "grad_norm": 0.8820189833641052, "learning_rate": 2e-05, "loss": 0.6105, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7990, "tokens_per_second_per_gpu": 10074.13, "total_tokens": 789046708 }, { "epoch": 0.4995623905976494, "grad_norm": 0.897924542427063, "learning_rate": 2e-05, "loss": 0.5959, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7991, "tokens_per_second_per_gpu": 9944.41, "total_tokens": 789138690 }, { "epoch": 0.49962490622655664, "grad_norm": 0.9029802680015564, "learning_rate": 2e-05, "loss": 0.6589, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7992, "tokens_per_second_per_gpu": 10837.98, "total_tokens": 789240472 }, { "epoch": 0.4996874218554639, "grad_norm": 0.8696165680885315, "learning_rate": 2e-05, "loss": 0.6054, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7993, "tokens_per_second_per_gpu": 9599.78, "total_tokens": 789338373 }, { "epoch": 0.4997499374843711, "grad_norm": 0.8624607920646667, "learning_rate": 2e-05, "loss": 0.6609, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7994, "tokens_per_second_per_gpu": 10988.54, "total_tokens": 789441689 }, { "epoch": 0.4998124531132783, "grad_norm": 0.8565172553062439, "learning_rate": 2e-05, "loss": 0.6622, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7995, "tokens_per_second_per_gpu": 11394.59, "total_tokens": 789546578 }, { "epoch": 0.49987496874218557, "grad_norm": 0.8732147216796875, "learning_rate": 2e-05, "loss": 0.6129, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7996, "tokens_per_second_per_gpu": 10457.01, "total_tokens": 789644641 }, { "epoch": 0.49993748437109276, "grad_norm": 0.9169918894767761, "learning_rate": 2e-05, "loss": 0.6438, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7997, "tokens_per_second_per_gpu": 10604.67, "total_tokens": 789743316 }, { "epoch": 0.5, "grad_norm": 0.8750367760658264, "learning_rate": 2e-05, "loss": 0.6134, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7998, "tokens_per_second_per_gpu": 10592.45, "total_tokens": 789840112 }, { "epoch": 0.5000625156289072, "grad_norm": 0.9466283321380615, "learning_rate": 2e-05, "loss": 0.6601, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 7999, "tokens_per_second_per_gpu": 10903.97, "total_tokens": 789936692 }, { "epoch": 0.5001250312578145, "grad_norm": 0.8634164929389954, "learning_rate": 2e-05, "loss": 0.5991, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8000, "tokens_per_second_per_gpu": 9929.54, "total_tokens": 790032918 }, { "epoch": 0.5001875468867217, "grad_norm": 0.8717974424362183, "learning_rate": 2e-05, "loss": 0.6681, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8001, "tokens_per_second_per_gpu": 11776.71, "total_tokens": 790138760 }, { "epoch": 0.5002500625156289, "grad_norm": 0.8687346577644348, "learning_rate": 2e-05, "loss": 0.625, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8002, "tokens_per_second_per_gpu": 11087.93, "total_tokens": 790236132 }, { "epoch": 0.5003125781445361, "grad_norm": 0.8880992531776428, "learning_rate": 2e-05, "loss": 0.6619, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8003, "tokens_per_second_per_gpu": 11150.05, "total_tokens": 790337386 }, { "epoch": 0.5003750937734434, "grad_norm": 0.9009386301040649, "learning_rate": 2e-05, "loss": 0.6745, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8004, "tokens_per_second_per_gpu": 10930.89, "total_tokens": 790441359 }, { "epoch": 0.5004376094023506, "grad_norm": 0.8846046924591064, "learning_rate": 2e-05, "loss": 0.6773, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8005, "tokens_per_second_per_gpu": 10924.7, "total_tokens": 790542718 }, { "epoch": 0.5005001250312578, "grad_norm": 0.8961657285690308, "learning_rate": 2e-05, "loss": 0.6276, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8006, "tokens_per_second_per_gpu": 9923.83, "total_tokens": 790640288 }, { "epoch": 0.5005626406601651, "grad_norm": 0.8956014513969421, "learning_rate": 2e-05, "loss": 0.6519, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8007, "tokens_per_second_per_gpu": 11545.67, "total_tokens": 790742564 }, { "epoch": 0.5006251562890722, "grad_norm": 0.8496471047401428, "learning_rate": 2e-05, "loss": 0.5991, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8008, "tokens_per_second_per_gpu": 11368.91, "total_tokens": 790844909 }, { "epoch": 0.5006876719179795, "grad_norm": 0.8700152635574341, "learning_rate": 2e-05, "loss": 0.627, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8009, "tokens_per_second_per_gpu": 10728.23, "total_tokens": 790946620 }, { "epoch": 0.5007501875468867, "grad_norm": 0.8917396068572998, "learning_rate": 2e-05, "loss": 0.6571, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8010, "tokens_per_second_per_gpu": 11123.1, "total_tokens": 791050288 }, { "epoch": 0.500812703175794, "grad_norm": 0.8675493001937866, "learning_rate": 2e-05, "loss": 0.6402, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8011, "tokens_per_second_per_gpu": 11247.98, "total_tokens": 791153549 }, { "epoch": 0.5008752188047012, "grad_norm": 0.9340028166770935, "learning_rate": 2e-05, "loss": 0.6791, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8012, "tokens_per_second_per_gpu": 10479.43, "total_tokens": 791254967 }, { "epoch": 0.5009377344336085, "grad_norm": 0.8868899941444397, "learning_rate": 2e-05, "loss": 0.6506, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8013, "tokens_per_second_per_gpu": 10387.65, "total_tokens": 791353890 }, { "epoch": 0.5010002500625156, "grad_norm": 1.0230504274368286, "learning_rate": 2e-05, "loss": 0.6404, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8014, "tokens_per_second_per_gpu": 9388.15, "total_tokens": 791447602 }, { "epoch": 0.5010627656914228, "grad_norm": 0.8989652395248413, "learning_rate": 2e-05, "loss": 0.6631, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8015, "tokens_per_second_per_gpu": 10756.09, "total_tokens": 791548070 }, { "epoch": 0.5011252813203301, "grad_norm": 0.8761565089225769, "learning_rate": 2e-05, "loss": 0.6427, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8016, "tokens_per_second_per_gpu": 10804.58, "total_tokens": 791650450 }, { "epoch": 0.5011877969492373, "grad_norm": 0.8922309279441833, "learning_rate": 2e-05, "loss": 0.6411, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8017, "tokens_per_second_per_gpu": 10076.25, "total_tokens": 791746348 }, { "epoch": 0.5012503125781446, "grad_norm": 0.9109164476394653, "learning_rate": 2e-05, "loss": 0.6137, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8018, "tokens_per_second_per_gpu": 10161.38, "total_tokens": 791840497 }, { "epoch": 0.5013128282070518, "grad_norm": 0.8557631969451904, "learning_rate": 2e-05, "loss": 0.6447, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8019, "tokens_per_second_per_gpu": 10827.35, "total_tokens": 791942766 }, { "epoch": 0.5013753438359589, "grad_norm": 0.9070133566856384, "learning_rate": 2e-05, "loss": 0.6299, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8020, "tokens_per_second_per_gpu": 9303.96, "total_tokens": 792035994 }, { "epoch": 0.5014378594648662, "grad_norm": 0.8635545969009399, "learning_rate": 2e-05, "loss": 0.6483, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8021, "tokens_per_second_per_gpu": 10390.47, "total_tokens": 792139109 }, { "epoch": 0.5015003750937734, "grad_norm": 0.8760800957679749, "learning_rate": 2e-05, "loss": 0.6428, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8022, "tokens_per_second_per_gpu": 11236.88, "total_tokens": 792241944 }, { "epoch": 0.5015628907226807, "grad_norm": 0.8920235633850098, "learning_rate": 2e-05, "loss": 0.6332, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8023, "tokens_per_second_per_gpu": 10662.22, "total_tokens": 792336854 }, { "epoch": 0.5016254063515879, "grad_norm": 0.8851656913757324, "learning_rate": 2e-05, "loss": 0.6188, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8024, "tokens_per_second_per_gpu": 10516.4, "total_tokens": 792434277 }, { "epoch": 0.5016879219804952, "grad_norm": 0.8775529265403748, "learning_rate": 2e-05, "loss": 0.6141, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8025, "tokens_per_second_per_gpu": 10301.2, "total_tokens": 792533476 }, { "epoch": 0.5017504376094023, "grad_norm": 0.8974536061286926, "learning_rate": 2e-05, "loss": 0.6485, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8026, "tokens_per_second_per_gpu": 10722.15, "total_tokens": 792632583 }, { "epoch": 0.5018129532383095, "grad_norm": 1.060132384300232, "learning_rate": 2e-05, "loss": 0.6975, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8027, "tokens_per_second_per_gpu": 10092.29, "total_tokens": 792728615 }, { "epoch": 0.5018754688672168, "grad_norm": 0.8490462899208069, "learning_rate": 2e-05, "loss": 0.6239, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8028, "tokens_per_second_per_gpu": 11187.53, "total_tokens": 792828225 }, { "epoch": 0.501937984496124, "grad_norm": 0.8471887707710266, "learning_rate": 2e-05, "loss": 0.6328, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8029, "tokens_per_second_per_gpu": 10747.16, "total_tokens": 792931102 }, { "epoch": 0.5020005001250313, "grad_norm": 0.8910149335861206, "learning_rate": 2e-05, "loss": 0.6297, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8030, "tokens_per_second_per_gpu": 9933.16, "total_tokens": 793026237 }, { "epoch": 0.5020630157539385, "grad_norm": 0.8755080103874207, "learning_rate": 2e-05, "loss": 0.6094, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8031, "tokens_per_second_per_gpu": 10423.61, "total_tokens": 793127138 }, { "epoch": 0.5021255313828457, "grad_norm": 0.8645676374435425, "learning_rate": 2e-05, "loss": 0.6173, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8032, "tokens_per_second_per_gpu": 10818.35, "total_tokens": 793224739 }, { "epoch": 0.5021880470117529, "grad_norm": 0.9118263125419617, "learning_rate": 2e-05, "loss": 0.6513, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8033, "tokens_per_second_per_gpu": 10612.85, "total_tokens": 793325740 }, { "epoch": 0.5022505626406601, "grad_norm": 0.9053899049758911, "learning_rate": 2e-05, "loss": 0.6869, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8034, "tokens_per_second_per_gpu": 10399.94, "total_tokens": 793426317 }, { "epoch": 0.5023130782695674, "grad_norm": 0.8646851181983948, "learning_rate": 2e-05, "loss": 0.6079, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8035, "tokens_per_second_per_gpu": 10455.57, "total_tokens": 793520961 }, { "epoch": 0.5023755938984746, "grad_norm": 0.8773461580276489, "learning_rate": 2e-05, "loss": 0.5797, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8036, "tokens_per_second_per_gpu": 10020.92, "total_tokens": 793616054 }, { "epoch": 0.5024381095273819, "grad_norm": 0.8802333474159241, "learning_rate": 2e-05, "loss": 0.6447, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8037, "tokens_per_second_per_gpu": 10566.56, "total_tokens": 793715937 }, { "epoch": 0.5025006251562891, "grad_norm": 0.9093921780586243, "learning_rate": 2e-05, "loss": 0.6736, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8038, "tokens_per_second_per_gpu": 10668.79, "total_tokens": 793818203 }, { "epoch": 0.5025631407851963, "grad_norm": 0.9133094549179077, "learning_rate": 2e-05, "loss": 0.6644, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8039, "tokens_per_second_per_gpu": 10294.43, "total_tokens": 793920020 }, { "epoch": 0.5026256564141035, "grad_norm": 0.8464044332504272, "learning_rate": 2e-05, "loss": 0.6284, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8040, "tokens_per_second_per_gpu": 10585.92, "total_tokens": 794022497 }, { "epoch": 0.5026881720430108, "grad_norm": 0.914683997631073, "learning_rate": 2e-05, "loss": 0.6636, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8041, "tokens_per_second_per_gpu": 10465.89, "total_tokens": 794121916 }, { "epoch": 0.502750687671918, "grad_norm": 0.8682564496994019, "learning_rate": 2e-05, "loss": 0.6222, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8042, "tokens_per_second_per_gpu": 10648.47, "total_tokens": 794219993 }, { "epoch": 0.5028132033008252, "grad_norm": 0.8715588450431824, "learning_rate": 2e-05, "loss": 0.6252, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8043, "tokens_per_second_per_gpu": 11005.63, "total_tokens": 794318884 }, { "epoch": 0.5028757189297325, "grad_norm": 0.9156914353370667, "learning_rate": 2e-05, "loss": 0.6358, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8044, "tokens_per_second_per_gpu": 10985.88, "total_tokens": 794418194 }, { "epoch": 0.5029382345586396, "grad_norm": 0.8979815244674683, "learning_rate": 2e-05, "loss": 0.6464, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8045, "tokens_per_second_per_gpu": 10343.72, "total_tokens": 794516204 }, { "epoch": 0.5030007501875469, "grad_norm": 0.8395136594772339, "learning_rate": 2e-05, "loss": 0.6172, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8046, "tokens_per_second_per_gpu": 10677.36, "total_tokens": 794616045 }, { "epoch": 0.5030632658164541, "grad_norm": 0.853118360042572, "learning_rate": 2e-05, "loss": 0.6144, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8047, "tokens_per_second_per_gpu": 9949.2, "total_tokens": 794714881 }, { "epoch": 0.5031257814453614, "grad_norm": 0.8720948696136475, "learning_rate": 2e-05, "loss": 0.6504, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8048, "tokens_per_second_per_gpu": 11140.89, "total_tokens": 794811952 }, { "epoch": 0.5031882970742686, "grad_norm": 0.8940891027450562, "learning_rate": 2e-05, "loss": 0.6547, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8049, "tokens_per_second_per_gpu": 10330.2, "total_tokens": 794909985 }, { "epoch": 0.5032508127031758, "grad_norm": 0.8782552480697632, "learning_rate": 2e-05, "loss": 0.6275, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8050, "tokens_per_second_per_gpu": 10887.09, "total_tokens": 795005875 }, { "epoch": 0.503313328332083, "grad_norm": 0.8401991724967957, "learning_rate": 2e-05, "loss": 0.6432, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8051, "tokens_per_second_per_gpu": 10949.79, "total_tokens": 795105376 }, { "epoch": 0.5033758439609902, "grad_norm": 0.9080361127853394, "learning_rate": 2e-05, "loss": 0.5854, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8052, "tokens_per_second_per_gpu": 9974.92, "total_tokens": 795199016 }, { "epoch": 0.5034383595898975, "grad_norm": 0.8772081136703491, "learning_rate": 2e-05, "loss": 0.6327, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8053, "tokens_per_second_per_gpu": 10685.01, "total_tokens": 795298280 }, { "epoch": 0.5035008752188047, "grad_norm": 0.8681245446205139, "learning_rate": 2e-05, "loss": 0.6397, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8054, "tokens_per_second_per_gpu": 10539.14, "total_tokens": 795398977 }, { "epoch": 0.503563390847712, "grad_norm": 0.8987775444984436, "learning_rate": 2e-05, "loss": 0.6329, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8055, "tokens_per_second_per_gpu": 10407.74, "total_tokens": 795498268 }, { "epoch": 0.5036259064766192, "grad_norm": 0.8445727229118347, "learning_rate": 2e-05, "loss": 0.5957, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8056, "tokens_per_second_per_gpu": 10978.55, "total_tokens": 795596763 }, { "epoch": 0.5036884221055263, "grad_norm": 0.9168239235877991, "learning_rate": 2e-05, "loss": 0.6627, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8057, "tokens_per_second_per_gpu": 10750.11, "total_tokens": 795696844 }, { "epoch": 0.5037509377344336, "grad_norm": 0.8676276803016663, "learning_rate": 2e-05, "loss": 0.6124, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8058, "tokens_per_second_per_gpu": 10552.35, "total_tokens": 795793764 }, { "epoch": 0.5038134533633408, "grad_norm": 0.8893340229988098, "learning_rate": 2e-05, "loss": 0.6639, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8059, "tokens_per_second_per_gpu": 11472.69, "total_tokens": 795893778 }, { "epoch": 0.5038759689922481, "grad_norm": 0.879403293132782, "learning_rate": 2e-05, "loss": 0.6955, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8060, "tokens_per_second_per_gpu": 10505.17, "total_tokens": 795992796 }, { "epoch": 0.5039384846211553, "grad_norm": 0.878896951675415, "learning_rate": 2e-05, "loss": 0.6327, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8061, "tokens_per_second_per_gpu": 11066.99, "total_tokens": 796094649 }, { "epoch": 0.5040010002500626, "grad_norm": 0.9127047657966614, "learning_rate": 2e-05, "loss": 0.6475, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8062, "tokens_per_second_per_gpu": 10601.89, "total_tokens": 796188937 }, { "epoch": 0.5040635158789697, "grad_norm": 0.9028503894805908, "learning_rate": 2e-05, "loss": 0.6375, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8063, "tokens_per_second_per_gpu": 8979.42, "total_tokens": 796281204 }, { "epoch": 0.5041260315078769, "grad_norm": 0.9128813147544861, "learning_rate": 2e-05, "loss": 0.7028, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8064, "tokens_per_second_per_gpu": 9971.1, "total_tokens": 796381666 }, { "epoch": 0.5041885471367842, "grad_norm": 0.8589302897453308, "learning_rate": 2e-05, "loss": 0.651, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8065, "tokens_per_second_per_gpu": 10900.7, "total_tokens": 796482365 }, { "epoch": 0.5042510627656914, "grad_norm": 0.8445535898208618, "learning_rate": 2e-05, "loss": 0.6235, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8066, "tokens_per_second_per_gpu": 10948.6, "total_tokens": 796581914 }, { "epoch": 0.5043135783945987, "grad_norm": 0.8871612548828125, "learning_rate": 2e-05, "loss": 0.6116, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8067, "tokens_per_second_per_gpu": 9587.38, "total_tokens": 796674934 }, { "epoch": 0.5043760940235059, "grad_norm": 0.9041314721107483, "learning_rate": 2e-05, "loss": 0.635, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8068, "tokens_per_second_per_gpu": 10718.81, "total_tokens": 796775577 }, { "epoch": 0.504438609652413, "grad_norm": 0.8977795839309692, "learning_rate": 2e-05, "loss": 0.6381, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8069, "tokens_per_second_per_gpu": 9612.3, "total_tokens": 796872064 }, { "epoch": 0.5045011252813203, "grad_norm": 0.8912869095802307, "learning_rate": 2e-05, "loss": 0.6339, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8070, "tokens_per_second_per_gpu": 9725.12, "total_tokens": 796971083 }, { "epoch": 0.5045636409102275, "grad_norm": 0.8770602941513062, "learning_rate": 2e-05, "loss": 0.6407, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8071, "tokens_per_second_per_gpu": 10695.81, "total_tokens": 797074252 }, { "epoch": 0.5046261565391348, "grad_norm": 0.8528137803077698, "learning_rate": 2e-05, "loss": 0.6648, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8072, "tokens_per_second_per_gpu": 16082.28, "total_tokens": 797176574 }, { "epoch": 0.504688672168042, "grad_norm": 0.8660191893577576, "learning_rate": 2e-05, "loss": 0.6377, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8073, "tokens_per_second_per_gpu": 10226.83, "total_tokens": 797272646 }, { "epoch": 0.5047511877969493, "grad_norm": 0.8880550265312195, "learning_rate": 2e-05, "loss": 0.6735, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8074, "tokens_per_second_per_gpu": 10020.48, "total_tokens": 797370566 }, { "epoch": 0.5048137034258565, "grad_norm": 0.8945627212524414, "learning_rate": 2e-05, "loss": 0.5998, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8075, "tokens_per_second_per_gpu": 10717.05, "total_tokens": 797470989 }, { "epoch": 0.5048762190547637, "grad_norm": 0.8924139738082886, "learning_rate": 2e-05, "loss": 0.6439, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8076, "tokens_per_second_per_gpu": 10889.54, "total_tokens": 797572805 }, { "epoch": 0.5049387346836709, "grad_norm": 0.8643787503242493, "learning_rate": 2e-05, "loss": 0.6059, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8077, "tokens_per_second_per_gpu": 10611.42, "total_tokens": 797671649 }, { "epoch": 0.5050012503125781, "grad_norm": 0.8420688509941101, "learning_rate": 2e-05, "loss": 0.6037, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8078, "tokens_per_second_per_gpu": 10839.48, "total_tokens": 797772084 }, { "epoch": 0.5050637659414854, "grad_norm": 0.8790666460990906, "learning_rate": 2e-05, "loss": 0.659, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8079, "tokens_per_second_per_gpu": 10549.65, "total_tokens": 797873683 }, { "epoch": 0.5051262815703926, "grad_norm": 0.9030668139457703, "learning_rate": 2e-05, "loss": 0.6509, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8080, "tokens_per_second_per_gpu": 10668.4, "total_tokens": 797971623 }, { "epoch": 0.5051887971992999, "grad_norm": 0.8766368627548218, "learning_rate": 2e-05, "loss": 0.615, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8081, "tokens_per_second_per_gpu": 10590.41, "total_tokens": 798071401 }, { "epoch": 0.505251312828207, "grad_norm": 0.9094419479370117, "learning_rate": 2e-05, "loss": 0.6746, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8082, "tokens_per_second_per_gpu": 10527.42, "total_tokens": 798174946 }, { "epoch": 0.5053138284571143, "grad_norm": 0.9022007584571838, "learning_rate": 2e-05, "loss": 0.6384, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8083, "tokens_per_second_per_gpu": 10243.1, "total_tokens": 798274010 }, { "epoch": 0.5053763440860215, "grad_norm": 0.8595781922340393, "learning_rate": 2e-05, "loss": 0.6282, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8084, "tokens_per_second_per_gpu": 9950.66, "total_tokens": 798374682 }, { "epoch": 0.5054388597149287, "grad_norm": 0.8719276785850525, "learning_rate": 2e-05, "loss": 0.6259, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8085, "tokens_per_second_per_gpu": 9931.3, "total_tokens": 798471181 }, { "epoch": 0.505501375343836, "grad_norm": 0.893913984298706, "learning_rate": 2e-05, "loss": 0.6223, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8086, "tokens_per_second_per_gpu": 9969.02, "total_tokens": 798567635 }, { "epoch": 0.5055638909727432, "grad_norm": 0.8789299726486206, "learning_rate": 2e-05, "loss": 0.6629, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8087, "tokens_per_second_per_gpu": 10598.25, "total_tokens": 798669333 }, { "epoch": 0.5056264066016504, "grad_norm": 0.8567805886268616, "learning_rate": 2e-05, "loss": 0.6202, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8088, "tokens_per_second_per_gpu": 10354.78, "total_tokens": 798770768 }, { "epoch": 0.5056889222305576, "grad_norm": 0.8940423727035522, "learning_rate": 2e-05, "loss": 0.6184, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8089, "tokens_per_second_per_gpu": 9700.64, "total_tokens": 798866174 }, { "epoch": 0.5057514378594649, "grad_norm": 0.8236247301101685, "learning_rate": 2e-05, "loss": 0.6424, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8090, "tokens_per_second_per_gpu": 10803.58, "total_tokens": 798971335 }, { "epoch": 0.5058139534883721, "grad_norm": 0.8827580213546753, "learning_rate": 2e-05, "loss": 0.6448, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8091, "tokens_per_second_per_gpu": 10831.77, "total_tokens": 799068720 }, { "epoch": 0.5058764691172793, "grad_norm": 0.9040899276733398, "learning_rate": 2e-05, "loss": 0.6304, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8092, "tokens_per_second_per_gpu": 10319.07, "total_tokens": 799164950 }, { "epoch": 0.5059389847461866, "grad_norm": 0.955416738986969, "learning_rate": 2e-05, "loss": 0.6584, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8093, "tokens_per_second_per_gpu": 9286.98, "total_tokens": 799259285 }, { "epoch": 0.5060015003750937, "grad_norm": 0.962685227394104, "learning_rate": 2e-05, "loss": 0.6682, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8094, "tokens_per_second_per_gpu": 9924.38, "total_tokens": 799355084 }, { "epoch": 0.506064016004001, "grad_norm": 0.9040170907974243, "learning_rate": 2e-05, "loss": 0.6689, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8095, "tokens_per_second_per_gpu": 10964.89, "total_tokens": 799455821 }, { "epoch": 0.5061265316329082, "grad_norm": 0.893290102481842, "learning_rate": 2e-05, "loss": 0.6908, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8096, "tokens_per_second_per_gpu": 11277.33, "total_tokens": 799558293 }, { "epoch": 0.5061890472618155, "grad_norm": 0.9164308905601501, "learning_rate": 2e-05, "loss": 0.6397, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8097, "tokens_per_second_per_gpu": 9836.42, "total_tokens": 799654283 }, { "epoch": 0.5062515628907227, "grad_norm": 0.8966354727745056, "learning_rate": 2e-05, "loss": 0.6284, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8098, "tokens_per_second_per_gpu": 10337.87, "total_tokens": 799751398 }, { "epoch": 0.50631407851963, "grad_norm": 1.0120587348937988, "learning_rate": 2e-05, "loss": 0.6047, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8099, "tokens_per_second_per_gpu": 11221.91, "total_tokens": 799852903 }, { "epoch": 0.5063765941485371, "grad_norm": 0.9138791561126709, "learning_rate": 2e-05, "loss": 0.633, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8100, "tokens_per_second_per_gpu": 10262.9, "total_tokens": 799947152 }, { "epoch": 0.5064391097774443, "grad_norm": 0.9274207353591919, "learning_rate": 2e-05, "loss": 0.6581, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8101, "tokens_per_second_per_gpu": 10665.86, "total_tokens": 800047217 }, { "epoch": 0.5065016254063516, "grad_norm": 0.941498875617981, "learning_rate": 2e-05, "loss": 0.6436, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8102, "tokens_per_second_per_gpu": 10975.74, "total_tokens": 800144681 }, { "epoch": 0.5065641410352588, "grad_norm": 0.8549249172210693, "learning_rate": 2e-05, "loss": 0.6328, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8103, "tokens_per_second_per_gpu": 10136.04, "total_tokens": 800244035 }, { "epoch": 0.5066266566641661, "grad_norm": 0.901706337928772, "learning_rate": 2e-05, "loss": 0.6928, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8104, "tokens_per_second_per_gpu": 10780.52, "total_tokens": 800345741 }, { "epoch": 0.5066891722930733, "grad_norm": 0.9247812628746033, "learning_rate": 2e-05, "loss": 0.6332, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8105, "tokens_per_second_per_gpu": 10127.11, "total_tokens": 800440202 }, { "epoch": 0.5067516879219804, "grad_norm": 0.9524098634719849, "learning_rate": 2e-05, "loss": 0.6701, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8106, "tokens_per_second_per_gpu": 10782.98, "total_tokens": 800539179 }, { "epoch": 0.5068142035508877, "grad_norm": 0.893714427947998, "learning_rate": 2e-05, "loss": 0.6427, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8107, "tokens_per_second_per_gpu": 10869.07, "total_tokens": 800642119 }, { "epoch": 0.5068767191797949, "grad_norm": 0.8744068741798401, "learning_rate": 2e-05, "loss": 0.6594, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8108, "tokens_per_second_per_gpu": 10682.97, "total_tokens": 800742682 }, { "epoch": 0.5069392348087022, "grad_norm": 0.8954468369483948, "learning_rate": 2e-05, "loss": 0.648, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8109, "tokens_per_second_per_gpu": 10294.6, "total_tokens": 800839972 }, { "epoch": 0.5070017504376094, "grad_norm": 0.8588884472846985, "learning_rate": 2e-05, "loss": 0.6021, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8110, "tokens_per_second_per_gpu": 10872.33, "total_tokens": 800934756 }, { "epoch": 0.5070642660665167, "grad_norm": 0.9473593235015869, "learning_rate": 2e-05, "loss": 0.6356, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8111, "tokens_per_second_per_gpu": 10068.97, "total_tokens": 801031640 }, { "epoch": 0.5071267816954238, "grad_norm": 0.882194459438324, "learning_rate": 2e-05, "loss": 0.614, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8112, "tokens_per_second_per_gpu": 10633.28, "total_tokens": 801132313 }, { "epoch": 0.507189297324331, "grad_norm": 0.9144291877746582, "learning_rate": 2e-05, "loss": 0.6456, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8113, "tokens_per_second_per_gpu": 11138.7, "total_tokens": 801232049 }, { "epoch": 0.5072518129532383, "grad_norm": 0.9124865531921387, "learning_rate": 2e-05, "loss": 0.68, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8114, "tokens_per_second_per_gpu": 9942.33, "total_tokens": 801329667 }, { "epoch": 0.5073143285821455, "grad_norm": 0.9066067934036255, "learning_rate": 2e-05, "loss": 0.6317, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8115, "tokens_per_second_per_gpu": 10197.59, "total_tokens": 801425897 }, { "epoch": 0.5073768442110528, "grad_norm": 0.927341639995575, "learning_rate": 2e-05, "loss": 0.6638, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8116, "tokens_per_second_per_gpu": 10390.58, "total_tokens": 801525646 }, { "epoch": 0.50743935983996, "grad_norm": 0.8949296474456787, "learning_rate": 2e-05, "loss": 0.6514, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8117, "tokens_per_second_per_gpu": 10345.88, "total_tokens": 801624964 }, { "epoch": 0.5075018754688673, "grad_norm": 0.8850945830345154, "learning_rate": 2e-05, "loss": 0.6311, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8118, "tokens_per_second_per_gpu": 10807.97, "total_tokens": 801725265 }, { "epoch": 0.5075643910977744, "grad_norm": 0.9137868285179138, "learning_rate": 2e-05, "loss": 0.6147, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8119, "tokens_per_second_per_gpu": 10057.86, "total_tokens": 801820390 }, { "epoch": 0.5076269067266816, "grad_norm": 0.9147606492042542, "learning_rate": 2e-05, "loss": 0.6347, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8120, "tokens_per_second_per_gpu": 10232.13, "total_tokens": 801913568 }, { "epoch": 0.5076894223555889, "grad_norm": 0.8896816968917847, "learning_rate": 2e-05, "loss": 0.6387, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8121, "tokens_per_second_per_gpu": 11000.27, "total_tokens": 802015744 }, { "epoch": 0.5077519379844961, "grad_norm": 0.9482116103172302, "learning_rate": 2e-05, "loss": 0.6147, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8122, "tokens_per_second_per_gpu": 9596.25, "total_tokens": 802108041 }, { "epoch": 0.5078144536134034, "grad_norm": 0.9394980072975159, "learning_rate": 2e-05, "loss": 0.6778, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8123, "tokens_per_second_per_gpu": 10147.97, "total_tokens": 802206626 }, { "epoch": 0.5078769692423106, "grad_norm": 0.9626311659812927, "learning_rate": 2e-05, "loss": 0.6678, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8124, "tokens_per_second_per_gpu": 10071.18, "total_tokens": 802307286 }, { "epoch": 0.5079394848712178, "grad_norm": 0.9306063652038574, "learning_rate": 2e-05, "loss": 0.6363, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8125, "tokens_per_second_per_gpu": 10314.49, "total_tokens": 802403831 }, { "epoch": 0.508002000500125, "grad_norm": 0.9116655588150024, "learning_rate": 2e-05, "loss": 0.6487, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8126, "tokens_per_second_per_gpu": 10389.57, "total_tokens": 802505288 }, { "epoch": 0.5080645161290323, "grad_norm": 0.9166299104690552, "learning_rate": 2e-05, "loss": 0.6488, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8127, "tokens_per_second_per_gpu": 10644.58, "total_tokens": 802604788 }, { "epoch": 0.5081270317579395, "grad_norm": 0.9186384081840515, "learning_rate": 2e-05, "loss": 0.653, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8128, "tokens_per_second_per_gpu": 10794.63, "total_tokens": 802704011 }, { "epoch": 0.5081895473868467, "grad_norm": 0.8614037036895752, "learning_rate": 2e-05, "loss": 0.6135, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8129, "tokens_per_second_per_gpu": 11220.85, "total_tokens": 802806327 }, { "epoch": 0.508252063015754, "grad_norm": 0.9071249961853027, "learning_rate": 2e-05, "loss": 0.6618, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8130, "tokens_per_second_per_gpu": 10175.62, "total_tokens": 802903060 }, { "epoch": 0.5083145786446611, "grad_norm": 0.9708498120307922, "learning_rate": 2e-05, "loss": 0.6785, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8131, "tokens_per_second_per_gpu": 10518.22, "total_tokens": 802998656 }, { "epoch": 0.5083770942735684, "grad_norm": 0.8903947472572327, "learning_rate": 2e-05, "loss": 0.6744, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8132, "tokens_per_second_per_gpu": 11117.19, "total_tokens": 803102065 }, { "epoch": 0.5084396099024756, "grad_norm": 0.9439392685890198, "learning_rate": 2e-05, "loss": 0.6123, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8133, "tokens_per_second_per_gpu": 10450.18, "total_tokens": 803198050 }, { "epoch": 0.5085021255313829, "grad_norm": 0.8591237664222717, "learning_rate": 2e-05, "loss": 0.6303, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8134, "tokens_per_second_per_gpu": 10253.92, "total_tokens": 803295828 }, { "epoch": 0.5085646411602901, "grad_norm": 0.912861704826355, "learning_rate": 2e-05, "loss": 0.6687, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8135, "tokens_per_second_per_gpu": 10064.51, "total_tokens": 803392917 }, { "epoch": 0.5086271567891973, "grad_norm": 0.9107754230499268, "learning_rate": 2e-05, "loss": 0.6934, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8136, "tokens_per_second_per_gpu": 10892.9, "total_tokens": 803495731 }, { "epoch": 0.5086896724181045, "grad_norm": 0.9271632432937622, "learning_rate": 2e-05, "loss": 0.651, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8137, "tokens_per_second_per_gpu": 10512.73, "total_tokens": 803594222 }, { "epoch": 0.5087521880470117, "grad_norm": 0.9043281674385071, "learning_rate": 2e-05, "loss": 0.6515, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8138, "tokens_per_second_per_gpu": 10733.15, "total_tokens": 803693118 }, { "epoch": 0.508814703675919, "grad_norm": 0.9018848538398743, "learning_rate": 2e-05, "loss": 0.6426, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8139, "tokens_per_second_per_gpu": 10815.76, "total_tokens": 803793818 }, { "epoch": 0.5088772193048262, "grad_norm": 0.8879655599594116, "learning_rate": 2e-05, "loss": 0.6766, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8140, "tokens_per_second_per_gpu": 10805.81, "total_tokens": 803895138 }, { "epoch": 0.5089397349337335, "grad_norm": 0.8757271766662598, "learning_rate": 2e-05, "loss": 0.6254, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8141, "tokens_per_second_per_gpu": 9843.36, "total_tokens": 803993835 }, { "epoch": 0.5090022505626407, "grad_norm": 0.9097186326980591, "learning_rate": 2e-05, "loss": 0.6332, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8142, "tokens_per_second_per_gpu": 9408.99, "total_tokens": 804089903 }, { "epoch": 0.5090647661915478, "grad_norm": 0.8913153409957886, "learning_rate": 2e-05, "loss": 0.6212, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8143, "tokens_per_second_per_gpu": 10410.18, "total_tokens": 804185305 }, { "epoch": 0.5091272818204551, "grad_norm": 0.9018717408180237, "learning_rate": 2e-05, "loss": 0.6463, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8144, "tokens_per_second_per_gpu": 10117.09, "total_tokens": 804282714 }, { "epoch": 0.5091897974493623, "grad_norm": 0.8968683481216431, "learning_rate": 2e-05, "loss": 0.6493, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8145, "tokens_per_second_per_gpu": 10608.98, "total_tokens": 804376763 }, { "epoch": 0.5092523130782696, "grad_norm": 0.884590744972229, "learning_rate": 2e-05, "loss": 0.5947, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8146, "tokens_per_second_per_gpu": 9780.99, "total_tokens": 804470508 }, { "epoch": 0.5093148287071768, "grad_norm": 0.939268171787262, "learning_rate": 2e-05, "loss": 0.7043, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8147, "tokens_per_second_per_gpu": 10458.94, "total_tokens": 804567730 }, { "epoch": 0.5093773443360841, "grad_norm": 0.9333909153938293, "learning_rate": 2e-05, "loss": 0.6227, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8148, "tokens_per_second_per_gpu": 10656.06, "total_tokens": 804665018 }, { "epoch": 0.5094398599649912, "grad_norm": 0.8889108300209045, "learning_rate": 2e-05, "loss": 0.6452, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8149, "tokens_per_second_per_gpu": 10444.7, "total_tokens": 804762582 }, { "epoch": 0.5095023755938984, "grad_norm": 0.8634263873100281, "learning_rate": 2e-05, "loss": 0.6623, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8150, "tokens_per_second_per_gpu": 10609.88, "total_tokens": 804861659 }, { "epoch": 0.5095648912228057, "grad_norm": 0.9434363842010498, "learning_rate": 2e-05, "loss": 0.6517, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8151, "tokens_per_second_per_gpu": 10455.77, "total_tokens": 804958222 }, { "epoch": 0.5096274068517129, "grad_norm": 0.9033250212669373, "learning_rate": 2e-05, "loss": 0.6718, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8152, "tokens_per_second_per_gpu": 10468.82, "total_tokens": 805057044 }, { "epoch": 0.5096899224806202, "grad_norm": 0.9051264524459839, "learning_rate": 2e-05, "loss": 0.6646, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8153, "tokens_per_second_per_gpu": 10870.33, "total_tokens": 805159861 }, { "epoch": 0.5097524381095274, "grad_norm": 0.9047483801841736, "learning_rate": 2e-05, "loss": 0.6568, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8154, "tokens_per_second_per_gpu": 10406.66, "total_tokens": 805261630 }, { "epoch": 0.5098149537384347, "grad_norm": 0.9200501441955566, "learning_rate": 2e-05, "loss": 0.6107, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8155, "tokens_per_second_per_gpu": 10532.3, "total_tokens": 805359443 }, { "epoch": 0.5098774693673418, "grad_norm": 0.9101101160049438, "learning_rate": 2e-05, "loss": 0.6715, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8156, "tokens_per_second_per_gpu": 11206.18, "total_tokens": 805458542 }, { "epoch": 0.509939984996249, "grad_norm": 0.90228271484375, "learning_rate": 2e-05, "loss": 0.6814, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8157, "tokens_per_second_per_gpu": 10734.75, "total_tokens": 805560291 }, { "epoch": 0.5100025006251563, "grad_norm": 0.8989644050598145, "learning_rate": 2e-05, "loss": 0.6158, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8158, "tokens_per_second_per_gpu": 10547.57, "total_tokens": 805655405 }, { "epoch": 0.5100650162540635, "grad_norm": 0.8837116360664368, "learning_rate": 2e-05, "loss": 0.6435, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8159, "tokens_per_second_per_gpu": 10609.61, "total_tokens": 805755816 }, { "epoch": 0.5101275318829708, "grad_norm": 0.9371444582939148, "learning_rate": 2e-05, "loss": 0.6366, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8160, "tokens_per_second_per_gpu": 10096.06, "total_tokens": 805853485 }, { "epoch": 0.510190047511878, "grad_norm": 0.8863374590873718, "learning_rate": 2e-05, "loss": 0.6922, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8161, "tokens_per_second_per_gpu": 10072.72, "total_tokens": 805953739 }, { "epoch": 0.5102525631407852, "grad_norm": 0.9007084965705872, "learning_rate": 2e-05, "loss": 0.6255, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8162, "tokens_per_second_per_gpu": 10032.44, "total_tokens": 806049785 }, { "epoch": 0.5103150787696924, "grad_norm": 0.913101077079773, "learning_rate": 2e-05, "loss": 0.627, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8163, "tokens_per_second_per_gpu": 10389.17, "total_tokens": 806149362 }, { "epoch": 0.5103775943985996, "grad_norm": 0.9212192893028259, "learning_rate": 2e-05, "loss": 0.619, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8164, "tokens_per_second_per_gpu": 10782.95, "total_tokens": 806248697 }, { "epoch": 0.5104401100275069, "grad_norm": 0.9395683407783508, "learning_rate": 2e-05, "loss": 0.6706, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8165, "tokens_per_second_per_gpu": 9095.84, "total_tokens": 806344501 }, { "epoch": 0.5105026256564141, "grad_norm": 0.873428225517273, "learning_rate": 2e-05, "loss": 0.6096, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8166, "tokens_per_second_per_gpu": 10530.27, "total_tokens": 806441434 }, { "epoch": 0.5105651412853214, "grad_norm": 0.9215556383132935, "learning_rate": 2e-05, "loss": 0.6514, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8167, "tokens_per_second_per_gpu": 11004.87, "total_tokens": 806540843 }, { "epoch": 0.5106276569142285, "grad_norm": 0.9800482988357544, "learning_rate": 2e-05, "loss": 0.6519, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8168, "tokens_per_second_per_gpu": 10369.48, "total_tokens": 806638067 }, { "epoch": 0.5106901725431358, "grad_norm": 0.9287232756614685, "learning_rate": 2e-05, "loss": 0.6761, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8169, "tokens_per_second_per_gpu": 9625.22, "total_tokens": 806732221 }, { "epoch": 0.510752688172043, "grad_norm": 0.8802853226661682, "learning_rate": 2e-05, "loss": 0.6548, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8170, "tokens_per_second_per_gpu": 10694.85, "total_tokens": 806830362 }, { "epoch": 0.5108152038009502, "grad_norm": 0.8574192523956299, "learning_rate": 2e-05, "loss": 0.6262, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8171, "tokens_per_second_per_gpu": 10797.67, "total_tokens": 806929479 }, { "epoch": 0.5108777194298575, "grad_norm": 0.886939525604248, "learning_rate": 2e-05, "loss": 0.6571, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8172, "tokens_per_second_per_gpu": 9844.65, "total_tokens": 807028584 }, { "epoch": 0.5109402350587647, "grad_norm": 0.913325309753418, "learning_rate": 2e-05, "loss": 0.665, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8173, "tokens_per_second_per_gpu": 9775.12, "total_tokens": 807122361 }, { "epoch": 0.5110027506876719, "grad_norm": 0.9092864990234375, "learning_rate": 2e-05, "loss": 0.683, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8174, "tokens_per_second_per_gpu": 10936.0, "total_tokens": 807224065 }, { "epoch": 0.5110652663165791, "grad_norm": 0.9118441939353943, "learning_rate": 2e-05, "loss": 0.6489, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8175, "tokens_per_second_per_gpu": 10636.85, "total_tokens": 807324165 }, { "epoch": 0.5111277819454864, "grad_norm": 0.9013127088546753, "learning_rate": 2e-05, "loss": 0.5867, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8176, "tokens_per_second_per_gpu": 10540.05, "total_tokens": 807418341 }, { "epoch": 0.5111902975743936, "grad_norm": 0.8780710101127625, "learning_rate": 2e-05, "loss": 0.6403, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8177, "tokens_per_second_per_gpu": 10634.9, "total_tokens": 807521175 }, { "epoch": 0.5112528132033008, "grad_norm": 0.8823028802871704, "learning_rate": 2e-05, "loss": 0.6614, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8178, "tokens_per_second_per_gpu": 11395.25, "total_tokens": 807623491 }, { "epoch": 0.5113153288322081, "grad_norm": 0.9817783832550049, "learning_rate": 2e-05, "loss": 0.643, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8179, "tokens_per_second_per_gpu": 10765.15, "total_tokens": 807722477 }, { "epoch": 0.5113778444611152, "grad_norm": 0.9008329510688782, "learning_rate": 2e-05, "loss": 0.608, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8180, "tokens_per_second_per_gpu": 9935.85, "total_tokens": 807814751 }, { "epoch": 0.5114403600900225, "grad_norm": 0.9130802750587463, "learning_rate": 2e-05, "loss": 0.6512, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8181, "tokens_per_second_per_gpu": 10385.01, "total_tokens": 807911313 }, { "epoch": 0.5115028757189297, "grad_norm": 0.9050057530403137, "learning_rate": 2e-05, "loss": 0.6251, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8182, "tokens_per_second_per_gpu": 10564.28, "total_tokens": 808011168 }, { "epoch": 0.511565391347837, "grad_norm": 0.9425851702690125, "learning_rate": 2e-05, "loss": 0.6776, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8183, "tokens_per_second_per_gpu": 10260.19, "total_tokens": 808106348 }, { "epoch": 0.5116279069767442, "grad_norm": 0.8955893516540527, "learning_rate": 2e-05, "loss": 0.6189, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8184, "tokens_per_second_per_gpu": 9889.58, "total_tokens": 808204485 }, { "epoch": 0.5116904226056515, "grad_norm": 0.8807375431060791, "learning_rate": 2e-05, "loss": 0.6351, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8185, "tokens_per_second_per_gpu": 10451.14, "total_tokens": 808304907 }, { "epoch": 0.5117529382345586, "grad_norm": 0.9386157393455505, "learning_rate": 2e-05, "loss": 0.6331, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8186, "tokens_per_second_per_gpu": 10190.7, "total_tokens": 808399244 }, { "epoch": 0.5118154538634658, "grad_norm": 0.9298338294029236, "learning_rate": 2e-05, "loss": 0.6904, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8187, "tokens_per_second_per_gpu": 10172.51, "total_tokens": 808497817 }, { "epoch": 0.5118779694923731, "grad_norm": 0.9396029114723206, "learning_rate": 2e-05, "loss": 0.642, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8188, "tokens_per_second_per_gpu": 10220.74, "total_tokens": 808590740 }, { "epoch": 0.5119404851212803, "grad_norm": 0.8696480393409729, "learning_rate": 2e-05, "loss": 0.6192, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8189, "tokens_per_second_per_gpu": 10590.62, "total_tokens": 808688201 }, { "epoch": 0.5120030007501876, "grad_norm": 0.9061380624771118, "learning_rate": 2e-05, "loss": 0.6514, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8190, "tokens_per_second_per_gpu": 9829.4, "total_tokens": 808780308 }, { "epoch": 0.5120655163790948, "grad_norm": 0.8725837469100952, "learning_rate": 2e-05, "loss": 0.6416, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8191, "tokens_per_second_per_gpu": 10956.06, "total_tokens": 808880494 }, { "epoch": 0.512128032008002, "grad_norm": 0.8940046429634094, "learning_rate": 2e-05, "loss": 0.6826, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8192, "tokens_per_second_per_gpu": 11254.29, "total_tokens": 808982950 }, { "epoch": 0.5121905476369092, "grad_norm": 0.904871940612793, "learning_rate": 2e-05, "loss": 0.6687, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8193, "tokens_per_second_per_gpu": 10675.92, "total_tokens": 809084430 }, { "epoch": 0.5122530632658164, "grad_norm": 0.9278146028518677, "learning_rate": 2e-05, "loss": 0.6478, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8194, "tokens_per_second_per_gpu": 10130.84, "total_tokens": 809176990 }, { "epoch": 0.5123155788947237, "grad_norm": 0.8951587080955505, "learning_rate": 2e-05, "loss": 0.6408, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8195, "tokens_per_second_per_gpu": 9995.73, "total_tokens": 809275677 }, { "epoch": 0.5123780945236309, "grad_norm": 0.8847556710243225, "learning_rate": 2e-05, "loss": 0.5965, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8196, "tokens_per_second_per_gpu": 10910.15, "total_tokens": 809370152 }, { "epoch": 0.5124406101525382, "grad_norm": 0.9144819974899292, "learning_rate": 2e-05, "loss": 0.632, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8197, "tokens_per_second_per_gpu": 10010.52, "total_tokens": 809467675 }, { "epoch": 0.5125031257814454, "grad_norm": 0.9236447811126709, "learning_rate": 2e-05, "loss": 0.6116, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8198, "tokens_per_second_per_gpu": 10437.45, "total_tokens": 809565221 }, { "epoch": 0.5125656414103525, "grad_norm": 0.884067952632904, "learning_rate": 2e-05, "loss": 0.6166, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8199, "tokens_per_second_per_gpu": 10498.98, "total_tokens": 809664781 }, { "epoch": 0.5126281570392598, "grad_norm": 0.8826280832290649, "learning_rate": 2e-05, "loss": 0.6138, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8200, "tokens_per_second_per_gpu": 10697.79, "total_tokens": 809764679 }, { "epoch": 0.512690672668167, "grad_norm": 0.9072520136833191, "learning_rate": 2e-05, "loss": 0.6471, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8201, "tokens_per_second_per_gpu": 10338.77, "total_tokens": 809859448 }, { "epoch": 0.5127531882970743, "grad_norm": 0.8839834332466125, "learning_rate": 2e-05, "loss": 0.615, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8202, "tokens_per_second_per_gpu": 10599.54, "total_tokens": 809957651 }, { "epoch": 0.5128157039259815, "grad_norm": 0.9300054907798767, "learning_rate": 2e-05, "loss": 0.6494, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8203, "tokens_per_second_per_gpu": 11823.53, "total_tokens": 810057565 }, { "epoch": 0.5128782195548888, "grad_norm": 0.894305408000946, "learning_rate": 2e-05, "loss": 0.6388, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8204, "tokens_per_second_per_gpu": 11284.75, "total_tokens": 810158134 }, { "epoch": 0.5129407351837959, "grad_norm": 0.9442243576049805, "learning_rate": 2e-05, "loss": 0.6529, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8205, "tokens_per_second_per_gpu": 10298.35, "total_tokens": 810254783 }, { "epoch": 0.5130032508127031, "grad_norm": 0.8974610567092896, "learning_rate": 2e-05, "loss": 0.6577, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8206, "tokens_per_second_per_gpu": 9700.91, "total_tokens": 810350558 }, { "epoch": 0.5130657664416104, "grad_norm": 0.8604193925857544, "learning_rate": 2e-05, "loss": 0.6338, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8207, "tokens_per_second_per_gpu": 10429.16, "total_tokens": 810447421 }, { "epoch": 0.5131282820705176, "grad_norm": 0.9007946848869324, "learning_rate": 2e-05, "loss": 0.6687, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8208, "tokens_per_second_per_gpu": 10305.15, "total_tokens": 810546209 }, { "epoch": 0.5131907976994249, "grad_norm": 0.9288353323936462, "learning_rate": 2e-05, "loss": 0.6395, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8209, "tokens_per_second_per_gpu": 9551.42, "total_tokens": 810637400 }, { "epoch": 0.5132533133283321, "grad_norm": 0.8713381290435791, "learning_rate": 2e-05, "loss": 0.6055, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8210, "tokens_per_second_per_gpu": 10277.54, "total_tokens": 810733487 }, { "epoch": 0.5133158289572393, "grad_norm": 0.8764718770980835, "learning_rate": 2e-05, "loss": 0.6092, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8211, "tokens_per_second_per_gpu": 10423.55, "total_tokens": 810829222 }, { "epoch": 0.5133783445861465, "grad_norm": 0.8799816370010376, "learning_rate": 2e-05, "loss": 0.6362, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8212, "tokens_per_second_per_gpu": 10953.46, "total_tokens": 810928789 }, { "epoch": 0.5134408602150538, "grad_norm": 0.9028106331825256, "learning_rate": 2e-05, "loss": 0.6722, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8213, "tokens_per_second_per_gpu": 10845.51, "total_tokens": 811027569 }, { "epoch": 0.513503375843961, "grad_norm": 0.8834346532821655, "learning_rate": 2e-05, "loss": 0.5954, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8214, "tokens_per_second_per_gpu": 10168.03, "total_tokens": 811122029 }, { "epoch": 0.5135658914728682, "grad_norm": 0.9097576141357422, "learning_rate": 2e-05, "loss": 0.655, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8215, "tokens_per_second_per_gpu": 10760.82, "total_tokens": 811222386 }, { "epoch": 0.5136284071017755, "grad_norm": 0.9275193810462952, "learning_rate": 2e-05, "loss": 0.6875, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8216, "tokens_per_second_per_gpu": 10145.03, "total_tokens": 811320743 }, { "epoch": 0.5136909227306826, "grad_norm": 0.9068028330802917, "learning_rate": 2e-05, "loss": 0.647, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8217, "tokens_per_second_per_gpu": 10960.77, "total_tokens": 811420964 }, { "epoch": 0.5137534383595899, "grad_norm": 0.9397684335708618, "learning_rate": 2e-05, "loss": 0.6294, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8218, "tokens_per_second_per_gpu": 10159.62, "total_tokens": 811519301 }, { "epoch": 0.5138159539884971, "grad_norm": 0.9322944283485413, "learning_rate": 2e-05, "loss": 0.6518, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8219, "tokens_per_second_per_gpu": 9926.71, "total_tokens": 811613875 }, { "epoch": 0.5138784696174044, "grad_norm": 0.8654453158378601, "learning_rate": 2e-05, "loss": 0.6468, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8220, "tokens_per_second_per_gpu": 11582.46, "total_tokens": 811720046 }, { "epoch": 0.5139409852463116, "grad_norm": 0.875421404838562, "learning_rate": 2e-05, "loss": 0.6134, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8221, "tokens_per_second_per_gpu": 10622.57, "total_tokens": 811816786 }, { "epoch": 0.5140035008752188, "grad_norm": 1.0459128618240356, "learning_rate": 2e-05, "loss": 0.6386, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8222, "tokens_per_second_per_gpu": 10753.92, "total_tokens": 811914606 }, { "epoch": 0.514066016504126, "grad_norm": 0.8930161595344543, "learning_rate": 2e-05, "loss": 0.6011, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8223, "tokens_per_second_per_gpu": 9854.21, "total_tokens": 812008064 }, { "epoch": 0.5141285321330332, "grad_norm": 0.8866321444511414, "learning_rate": 2e-05, "loss": 0.652, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8224, "tokens_per_second_per_gpu": 10839.04, "total_tokens": 812109731 }, { "epoch": 0.5141910477619405, "grad_norm": 0.9306619763374329, "learning_rate": 2e-05, "loss": 0.6533, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8225, "tokens_per_second_per_gpu": 10180.6, "total_tokens": 812208394 }, { "epoch": 0.5142535633908477, "grad_norm": 0.9282443523406982, "learning_rate": 2e-05, "loss": 0.6847, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8226, "tokens_per_second_per_gpu": 10019.83, "total_tokens": 812303997 }, { "epoch": 0.514316079019755, "grad_norm": 0.9111631512641907, "learning_rate": 2e-05, "loss": 0.6377, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8227, "tokens_per_second_per_gpu": 9456.37, "total_tokens": 812395090 }, { "epoch": 0.5143785946486622, "grad_norm": 0.871998131275177, "learning_rate": 2e-05, "loss": 0.6326, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8228, "tokens_per_second_per_gpu": 10697.21, "total_tokens": 812491101 }, { "epoch": 0.5144411102775694, "grad_norm": 0.9241656064987183, "learning_rate": 2e-05, "loss": 0.659, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8229, "tokens_per_second_per_gpu": 10814.73, "total_tokens": 812590401 }, { "epoch": 0.5145036259064766, "grad_norm": 0.8667855262756348, "learning_rate": 2e-05, "loss": 0.6292, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8230, "tokens_per_second_per_gpu": 10311.54, "total_tokens": 812688657 }, { "epoch": 0.5145661415353838, "grad_norm": 0.9120312333106995, "learning_rate": 2e-05, "loss": 0.6714, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8231, "tokens_per_second_per_gpu": 10157.29, "total_tokens": 812785446 }, { "epoch": 0.5146286571642911, "grad_norm": 0.9111056923866272, "learning_rate": 2e-05, "loss": 0.6757, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8232, "tokens_per_second_per_gpu": 10457.03, "total_tokens": 812885202 }, { "epoch": 0.5146911727931983, "grad_norm": 0.9934702515602112, "learning_rate": 2e-05, "loss": 0.6898, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8233, "tokens_per_second_per_gpu": 10393.75, "total_tokens": 812982846 }, { "epoch": 0.5147536884221056, "grad_norm": 0.8967097401618958, "learning_rate": 2e-05, "loss": 0.6447, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8234, "tokens_per_second_per_gpu": 10419.03, "total_tokens": 813082942 }, { "epoch": 0.5148162040510128, "grad_norm": 0.867838978767395, "learning_rate": 2e-05, "loss": 0.6746, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8235, "tokens_per_second_per_gpu": 10425.11, "total_tokens": 813180782 }, { "epoch": 0.5148787196799199, "grad_norm": 0.9138678312301636, "learning_rate": 2e-05, "loss": 0.6742, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8236, "tokens_per_second_per_gpu": 10913.14, "total_tokens": 813276965 }, { "epoch": 0.5149412353088272, "grad_norm": 0.8877512812614441, "learning_rate": 2e-05, "loss": 0.64, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8237, "tokens_per_second_per_gpu": 9950.13, "total_tokens": 813373425 }, { "epoch": 0.5150037509377344, "grad_norm": 0.910088062286377, "learning_rate": 2e-05, "loss": 0.6527, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8238, "tokens_per_second_per_gpu": 10829.61, "total_tokens": 813471504 }, { "epoch": 0.5150662665666417, "grad_norm": 0.9176217317581177, "learning_rate": 2e-05, "loss": 0.6568, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8239, "tokens_per_second_per_gpu": 10075.04, "total_tokens": 813565897 }, { "epoch": 0.5151287821955489, "grad_norm": 0.9196051955223083, "learning_rate": 2e-05, "loss": 0.662, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8240, "tokens_per_second_per_gpu": 9869.2, "total_tokens": 813662630 }, { "epoch": 0.5151912978244562, "grad_norm": 0.889261782169342, "learning_rate": 2e-05, "loss": 0.6205, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8241, "tokens_per_second_per_gpu": 9817.76, "total_tokens": 813759091 }, { "epoch": 0.5152538134533633, "grad_norm": 0.957882821559906, "learning_rate": 2e-05, "loss": 0.6623, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8242, "tokens_per_second_per_gpu": 10166.48, "total_tokens": 813854643 }, { "epoch": 0.5153163290822705, "grad_norm": 0.9165768027305603, "learning_rate": 2e-05, "loss": 0.669, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8243, "tokens_per_second_per_gpu": 10073.78, "total_tokens": 813952811 }, { "epoch": 0.5153788447111778, "grad_norm": 0.8911260962486267, "learning_rate": 2e-05, "loss": 0.6285, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8244, "tokens_per_second_per_gpu": 9667.04, "total_tokens": 814045053 }, { "epoch": 0.515441360340085, "grad_norm": 0.9116860628128052, "learning_rate": 2e-05, "loss": 0.6407, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8245, "tokens_per_second_per_gpu": 10329.72, "total_tokens": 814138491 }, { "epoch": 0.5155038759689923, "grad_norm": 0.867050290107727, "learning_rate": 2e-05, "loss": 0.6562, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8246, "tokens_per_second_per_gpu": 11014.73, "total_tokens": 814240794 }, { "epoch": 0.5155663915978995, "grad_norm": 0.8863732218742371, "learning_rate": 2e-05, "loss": 0.6157, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8247, "tokens_per_second_per_gpu": 10084.93, "total_tokens": 814338087 }, { "epoch": 0.5156289072268067, "grad_norm": 0.8885990381240845, "learning_rate": 2e-05, "loss": 0.6202, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8248, "tokens_per_second_per_gpu": 10780.84, "total_tokens": 814432268 }, { "epoch": 0.5156914228557139, "grad_norm": 0.8976847529411316, "learning_rate": 2e-05, "loss": 0.6803, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8249, "tokens_per_second_per_gpu": 10451.86, "total_tokens": 814532620 }, { "epoch": 0.5157539384846211, "grad_norm": 0.8975993394851685, "learning_rate": 2e-05, "loss": 0.6753, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8250, "tokens_per_second_per_gpu": 10860.1, "total_tokens": 814633871 }, { "epoch": 0.5158164541135284, "grad_norm": 0.9178313612937927, "learning_rate": 2e-05, "loss": 0.6491, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8251, "tokens_per_second_per_gpu": 10616.93, "total_tokens": 814734878 }, { "epoch": 0.5158789697424356, "grad_norm": 0.8911881446838379, "learning_rate": 2e-05, "loss": 0.6732, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8252, "tokens_per_second_per_gpu": 10786.35, "total_tokens": 814834442 }, { "epoch": 0.5159414853713429, "grad_norm": 0.8991373181343079, "learning_rate": 2e-05, "loss": 0.6177, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8253, "tokens_per_second_per_gpu": 10397.37, "total_tokens": 814930726 }, { "epoch": 0.51600400100025, "grad_norm": 0.8940724730491638, "learning_rate": 2e-05, "loss": 0.618, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8254, "tokens_per_second_per_gpu": 10192.74, "total_tokens": 815026094 }, { "epoch": 0.5160665166291573, "grad_norm": 0.893468976020813, "learning_rate": 2e-05, "loss": 0.6622, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8255, "tokens_per_second_per_gpu": 10745.96, "total_tokens": 815123861 }, { "epoch": 0.5161290322580645, "grad_norm": 0.8889527916908264, "learning_rate": 2e-05, "loss": 0.6587, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8256, "tokens_per_second_per_gpu": 10953.42, "total_tokens": 815223677 }, { "epoch": 0.5161915478869717, "grad_norm": 0.9177516102790833, "learning_rate": 2e-05, "loss": 0.6831, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8257, "tokens_per_second_per_gpu": 10779.94, "total_tokens": 815323130 }, { "epoch": 0.516254063515879, "grad_norm": 0.89945387840271, "learning_rate": 2e-05, "loss": 0.663, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8258, "tokens_per_second_per_gpu": 11868.24, "total_tokens": 815419586 }, { "epoch": 0.5163165791447862, "grad_norm": 0.8623560667037964, "learning_rate": 2e-05, "loss": 0.6361, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8259, "tokens_per_second_per_gpu": 10694.24, "total_tokens": 815522768 }, { "epoch": 0.5163790947736934, "grad_norm": 0.8524755239486694, "learning_rate": 2e-05, "loss": 0.5991, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8260, "tokens_per_second_per_gpu": 10841.82, "total_tokens": 815622004 }, { "epoch": 0.5164416104026006, "grad_norm": 0.851215660572052, "learning_rate": 2e-05, "loss": 0.6189, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8261, "tokens_per_second_per_gpu": 10178.75, "total_tokens": 815721362 }, { "epoch": 0.5165041260315079, "grad_norm": 0.8839423060417175, "learning_rate": 2e-05, "loss": 0.6367, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8262, "tokens_per_second_per_gpu": 10403.37, "total_tokens": 815820384 }, { "epoch": 0.5165666416604151, "grad_norm": 0.9671895503997803, "learning_rate": 2e-05, "loss": 0.6407, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8263, "tokens_per_second_per_gpu": 9892.53, "total_tokens": 815912409 }, { "epoch": 0.5166291572893223, "grad_norm": 0.923629641532898, "learning_rate": 2e-05, "loss": 0.6373, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8264, "tokens_per_second_per_gpu": 9799.22, "total_tokens": 816008019 }, { "epoch": 0.5166916729182296, "grad_norm": 0.9150856137275696, "learning_rate": 2e-05, "loss": 0.6456, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8265, "tokens_per_second_per_gpu": 11343.61, "total_tokens": 816109643 }, { "epoch": 0.5167541885471368, "grad_norm": 0.8958181142807007, "learning_rate": 2e-05, "loss": 0.6303, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8266, "tokens_per_second_per_gpu": 9897.57, "total_tokens": 816203319 }, { "epoch": 0.516816704176044, "grad_norm": 0.9012343883514404, "learning_rate": 2e-05, "loss": 0.6401, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8267, "tokens_per_second_per_gpu": 11018.51, "total_tokens": 816305253 }, { "epoch": 0.5168792198049512, "grad_norm": 0.9348307847976685, "learning_rate": 2e-05, "loss": 0.6719, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8268, "tokens_per_second_per_gpu": 10139.44, "total_tokens": 816400364 }, { "epoch": 0.5169417354338585, "grad_norm": 0.8870413899421692, "learning_rate": 2e-05, "loss": 0.6256, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8269, "tokens_per_second_per_gpu": 10712.72, "total_tokens": 816501555 }, { "epoch": 0.5170042510627657, "grad_norm": 0.8897355794906616, "learning_rate": 2e-05, "loss": 0.6374, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8270, "tokens_per_second_per_gpu": 9399.1, "total_tokens": 816596469 }, { "epoch": 0.517066766691673, "grad_norm": 0.9059029221534729, "learning_rate": 2e-05, "loss": 0.6203, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8271, "tokens_per_second_per_gpu": 9827.95, "total_tokens": 816689899 }, { "epoch": 0.5171292823205802, "grad_norm": 0.9017872214317322, "learning_rate": 2e-05, "loss": 0.6417, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8272, "tokens_per_second_per_gpu": 11226.02, "total_tokens": 816787021 }, { "epoch": 0.5171917979494873, "grad_norm": 1.0221359729766846, "learning_rate": 2e-05, "loss": 0.6316, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8273, "tokens_per_second_per_gpu": 10361.16, "total_tokens": 816884255 }, { "epoch": 0.5172543135783946, "grad_norm": 0.9044672250747681, "learning_rate": 2e-05, "loss": 0.6267, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8274, "tokens_per_second_per_gpu": 10119.56, "total_tokens": 816978117 }, { "epoch": 0.5173168292073018, "grad_norm": 0.9035953283309937, "learning_rate": 2e-05, "loss": 0.628, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8275, "tokens_per_second_per_gpu": 10021.89, "total_tokens": 817072462 }, { "epoch": 0.5173793448362091, "grad_norm": 0.8773500323295593, "learning_rate": 2e-05, "loss": 0.6114, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8276, "tokens_per_second_per_gpu": 10777.33, "total_tokens": 817169462 }, { "epoch": 0.5174418604651163, "grad_norm": 0.9050483107566833, "learning_rate": 2e-05, "loss": 0.642, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8277, "tokens_per_second_per_gpu": 10961.22, "total_tokens": 817272348 }, { "epoch": 0.5175043760940236, "grad_norm": 0.9115346670150757, "learning_rate": 2e-05, "loss": 0.6658, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8278, "tokens_per_second_per_gpu": 9097.33, "total_tokens": 817365262 }, { "epoch": 0.5175668917229307, "grad_norm": 0.9032586812973022, "learning_rate": 2e-05, "loss": 0.6592, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8279, "tokens_per_second_per_gpu": 10143.41, "total_tokens": 817463722 }, { "epoch": 0.5176294073518379, "grad_norm": 0.8884122371673584, "learning_rate": 2e-05, "loss": 0.607, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8280, "tokens_per_second_per_gpu": 9733.47, "total_tokens": 817558852 }, { "epoch": 0.5176919229807452, "grad_norm": 0.9446884989738464, "learning_rate": 2e-05, "loss": 0.6707, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8281, "tokens_per_second_per_gpu": 9477.07, "total_tokens": 817651374 }, { "epoch": 0.5177544386096524, "grad_norm": 0.9187116622924805, "learning_rate": 2e-05, "loss": 0.6222, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8282, "tokens_per_second_per_gpu": 11030.15, "total_tokens": 817748988 }, { "epoch": 0.5178169542385597, "grad_norm": 0.9246735572814941, "learning_rate": 2e-05, "loss": 0.6623, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8283, "tokens_per_second_per_gpu": 10403.2, "total_tokens": 817845841 }, { "epoch": 0.5178794698674669, "grad_norm": 0.8965832591056824, "learning_rate": 2e-05, "loss": 0.6762, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8284, "tokens_per_second_per_gpu": 11197.44, "total_tokens": 817945578 }, { "epoch": 0.517941985496374, "grad_norm": 0.907566487789154, "learning_rate": 2e-05, "loss": 0.64, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8285, "tokens_per_second_per_gpu": 10187.46, "total_tokens": 818045000 }, { "epoch": 0.5180045011252813, "grad_norm": 0.8757901787757874, "learning_rate": 2e-05, "loss": 0.6301, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8286, "tokens_per_second_per_gpu": 10377.51, "total_tokens": 818146531 }, { "epoch": 0.5180670167541885, "grad_norm": 0.8523221015930176, "learning_rate": 2e-05, "loss": 0.6493, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8287, "tokens_per_second_per_gpu": 10219.51, "total_tokens": 818247793 }, { "epoch": 0.5181295323830958, "grad_norm": 0.911765456199646, "learning_rate": 2e-05, "loss": 0.7148, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8288, "tokens_per_second_per_gpu": 10769.08, "total_tokens": 818347720 }, { "epoch": 0.518192048012003, "grad_norm": 0.9104452133178711, "learning_rate": 2e-05, "loss": 0.6677, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8289, "tokens_per_second_per_gpu": 11049.72, "total_tokens": 818443739 }, { "epoch": 0.5182545636409103, "grad_norm": 0.9061303734779358, "learning_rate": 2e-05, "loss": 0.6286, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8290, "tokens_per_second_per_gpu": 10143.31, "total_tokens": 818542350 }, { "epoch": 0.5183170792698174, "grad_norm": 0.9568086862564087, "learning_rate": 2e-05, "loss": 0.6322, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8291, "tokens_per_second_per_gpu": 11239.96, "total_tokens": 818643344 }, { "epoch": 0.5183795948987246, "grad_norm": 0.8955398201942444, "learning_rate": 2e-05, "loss": 0.7039, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8292, "tokens_per_second_per_gpu": 10651.32, "total_tokens": 818744093 }, { "epoch": 0.5184421105276319, "grad_norm": 0.8713249564170837, "learning_rate": 2e-05, "loss": 0.6326, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8293, "tokens_per_second_per_gpu": 10036.17, "total_tokens": 818840887 }, { "epoch": 0.5185046261565391, "grad_norm": 0.9534803032875061, "learning_rate": 2e-05, "loss": 0.6547, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8294, "tokens_per_second_per_gpu": 10304.41, "total_tokens": 818938378 }, { "epoch": 0.5185671417854464, "grad_norm": 0.9277481436729431, "learning_rate": 2e-05, "loss": 0.6618, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8295, "tokens_per_second_per_gpu": 10157.69, "total_tokens": 819030799 }, { "epoch": 0.5186296574143536, "grad_norm": 0.8946969509124756, "learning_rate": 2e-05, "loss": 0.6306, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8296, "tokens_per_second_per_gpu": 10484.34, "total_tokens": 819124760 }, { "epoch": 0.5186921730432608, "grad_norm": 0.9199209213256836, "learning_rate": 2e-05, "loss": 0.6364, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8297, "tokens_per_second_per_gpu": 10493.44, "total_tokens": 819217884 }, { "epoch": 0.518754688672168, "grad_norm": 0.90970778465271, "learning_rate": 2e-05, "loss": 0.6573, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8298, "tokens_per_second_per_gpu": 10281.07, "total_tokens": 819314129 }, { "epoch": 0.5188172043010753, "grad_norm": 0.9582369327545166, "learning_rate": 2e-05, "loss": 0.6679, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8299, "tokens_per_second_per_gpu": 9422.51, "total_tokens": 819407354 }, { "epoch": 0.5188797199299825, "grad_norm": 0.8879996538162231, "learning_rate": 2e-05, "loss": 0.6218, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8300, "tokens_per_second_per_gpu": 10828.33, "total_tokens": 819502632 }, { "epoch": 0.5189422355588897, "grad_norm": 0.893311619758606, "learning_rate": 2e-05, "loss": 0.6201, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8301, "tokens_per_second_per_gpu": 9610.24, "total_tokens": 819596714 }, { "epoch": 0.519004751187797, "grad_norm": 0.9750294089317322, "learning_rate": 2e-05, "loss": 0.6102, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8302, "tokens_per_second_per_gpu": 9599.26, "total_tokens": 819691962 }, { "epoch": 0.5190672668167042, "grad_norm": 0.9182999730110168, "learning_rate": 2e-05, "loss": 0.6751, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8303, "tokens_per_second_per_gpu": 10141.66, "total_tokens": 819791580 }, { "epoch": 0.5191297824456114, "grad_norm": 0.9253765940666199, "learning_rate": 2e-05, "loss": 0.603, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8304, "tokens_per_second_per_gpu": 9272.41, "total_tokens": 819881671 }, { "epoch": 0.5191922980745186, "grad_norm": 0.9066177606582642, "learning_rate": 2e-05, "loss": 0.6509, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8305, "tokens_per_second_per_gpu": 10512.9, "total_tokens": 819977333 }, { "epoch": 0.5192548137034259, "grad_norm": 0.912987470626831, "learning_rate": 2e-05, "loss": 0.6677, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8306, "tokens_per_second_per_gpu": 9538.9, "total_tokens": 820067683 }, { "epoch": 0.5193173293323331, "grad_norm": 0.9098270535469055, "learning_rate": 2e-05, "loss": 0.6426, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8307, "tokens_per_second_per_gpu": 10986.32, "total_tokens": 820165558 }, { "epoch": 0.5193798449612403, "grad_norm": 0.893595814704895, "learning_rate": 2e-05, "loss": 0.6473, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8308, "tokens_per_second_per_gpu": 10817.37, "total_tokens": 820261346 }, { "epoch": 0.5194423605901476, "grad_norm": 1.9086815118789673, "learning_rate": 2e-05, "loss": 0.6304, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8309, "tokens_per_second_per_gpu": 11028.21, "total_tokens": 820356687 }, { "epoch": 0.5195048762190547, "grad_norm": 0.897465705871582, "learning_rate": 2e-05, "loss": 0.6072, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8310, "tokens_per_second_per_gpu": 17159.06, "total_tokens": 820451251 }, { "epoch": 0.519567391847962, "grad_norm": 0.9257897138595581, "learning_rate": 2e-05, "loss": 0.6186, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8311, "tokens_per_second_per_gpu": 16972.88, "total_tokens": 820551429 }, { "epoch": 0.5196299074768692, "grad_norm": 0.9105944037437439, "learning_rate": 2e-05, "loss": 0.6398, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8312, "tokens_per_second_per_gpu": 17194.64, "total_tokens": 820649194 }, { "epoch": 0.5196924231057765, "grad_norm": 0.935206413269043, "learning_rate": 2e-05, "loss": 0.6141, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8313, "tokens_per_second_per_gpu": 18412.78, "total_tokens": 820750497 }, { "epoch": 0.5197549387346837, "grad_norm": 0.9543895125389099, "learning_rate": 2e-05, "loss": 0.6043, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8314, "tokens_per_second_per_gpu": 15817.38, "total_tokens": 820844610 }, { "epoch": 0.519817454363591, "grad_norm": 0.9308919310569763, "learning_rate": 2e-05, "loss": 0.6498, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8315, "tokens_per_second_per_gpu": 17346.03, "total_tokens": 820944314 }, { "epoch": 0.5198799699924981, "grad_norm": 0.8905717730522156, "learning_rate": 2e-05, "loss": 0.6035, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8316, "tokens_per_second_per_gpu": 16220.33, "total_tokens": 821040314 }, { "epoch": 0.5199424856214053, "grad_norm": 0.8853635191917419, "learning_rate": 2e-05, "loss": 0.6071, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8317, "tokens_per_second_per_gpu": 16384.94, "total_tokens": 821135327 }, { "epoch": 0.5200050012503126, "grad_norm": 0.890942394733429, "learning_rate": 2e-05, "loss": 0.6461, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8318, "tokens_per_second_per_gpu": 18157.81, "total_tokens": 821239314 }, { "epoch": 0.5200675168792198, "grad_norm": 0.9061707258224487, "learning_rate": 2e-05, "loss": 0.6256, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8319, "tokens_per_second_per_gpu": 17960.28, "total_tokens": 821336078 }, { "epoch": 0.5201300325081271, "grad_norm": 0.8779898285865784, "learning_rate": 2e-05, "loss": 0.645, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8320, "tokens_per_second_per_gpu": 18480.31, "total_tokens": 821437600 }, { "epoch": 0.5201925481370343, "grad_norm": 0.9164010286331177, "learning_rate": 2e-05, "loss": 0.6362, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8321, "tokens_per_second_per_gpu": 16131.08, "total_tokens": 821532011 }, { "epoch": 0.5202550637659414, "grad_norm": 0.8994883298873901, "learning_rate": 2e-05, "loss": 0.6034, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8322, "tokens_per_second_per_gpu": 16029.3, "total_tokens": 821625240 }, { "epoch": 0.5203175793948487, "grad_norm": 0.9431034326553345, "learning_rate": 2e-05, "loss": 0.6305, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8323, "tokens_per_second_per_gpu": 16090.7, "total_tokens": 821720598 }, { "epoch": 0.5203800950237559, "grad_norm": 0.9024908542633057, "learning_rate": 2e-05, "loss": 0.6491, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8324, "tokens_per_second_per_gpu": 16850.65, "total_tokens": 821818802 }, { "epoch": 0.5204426106526632, "grad_norm": 0.83833909034729, "learning_rate": 2e-05, "loss": 0.5935, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8325, "tokens_per_second_per_gpu": 17129.05, "total_tokens": 821916483 }, { "epoch": 0.5205051262815704, "grad_norm": 0.8921447396278381, "learning_rate": 2e-05, "loss": 0.641, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8326, "tokens_per_second_per_gpu": 11552.47, "total_tokens": 822016974 }, { "epoch": 0.5205676419104777, "grad_norm": 0.9054815769195557, "learning_rate": 2e-05, "loss": 0.6464, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8327, "tokens_per_second_per_gpu": 9990.17, "total_tokens": 822114068 }, { "epoch": 0.5206301575393848, "grad_norm": 0.8683496713638306, "learning_rate": 2e-05, "loss": 0.6173, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8328, "tokens_per_second_per_gpu": 10567.46, "total_tokens": 822212881 }, { "epoch": 0.520692673168292, "grad_norm": 0.926194965839386, "learning_rate": 2e-05, "loss": 0.633, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8329, "tokens_per_second_per_gpu": 10218.5, "total_tokens": 822308747 }, { "epoch": 0.5207551887971993, "grad_norm": 0.9042254686355591, "learning_rate": 2e-05, "loss": 0.624, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8330, "tokens_per_second_per_gpu": 9829.65, "total_tokens": 822401387 }, { "epoch": 0.5208177044261065, "grad_norm": 0.8609461784362793, "learning_rate": 2e-05, "loss": 0.6424, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8331, "tokens_per_second_per_gpu": 11040.18, "total_tokens": 822503220 }, { "epoch": 0.5208802200550138, "grad_norm": 0.8956277370452881, "learning_rate": 2e-05, "loss": 0.6776, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8332, "tokens_per_second_per_gpu": 10046.28, "total_tokens": 822603378 }, { "epoch": 0.520942735683921, "grad_norm": 0.9089882969856262, "learning_rate": 2e-05, "loss": 0.6533, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8333, "tokens_per_second_per_gpu": 9636.84, "total_tokens": 822699691 }, { "epoch": 0.5210052513128282, "grad_norm": 0.8886786103248596, "learning_rate": 2e-05, "loss": 0.6523, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8334, "tokens_per_second_per_gpu": 10977.82, "total_tokens": 822801789 }, { "epoch": 0.5210677669417354, "grad_norm": 0.8960942029953003, "learning_rate": 2e-05, "loss": 0.6101, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8335, "tokens_per_second_per_gpu": 10605.12, "total_tokens": 822899127 }, { "epoch": 0.5211302825706426, "grad_norm": 0.8801491856575012, "learning_rate": 2e-05, "loss": 0.6169, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8336, "tokens_per_second_per_gpu": 11558.87, "total_tokens": 823002908 }, { "epoch": 0.5211927981995499, "grad_norm": 0.9118766784667969, "learning_rate": 2e-05, "loss": 0.6533, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8337, "tokens_per_second_per_gpu": 11544.42, "total_tokens": 823099797 }, { "epoch": 0.5212553138284571, "grad_norm": 0.9086940884590149, "learning_rate": 2e-05, "loss": 0.6258, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8338, "tokens_per_second_per_gpu": 10219.18, "total_tokens": 823194366 }, { "epoch": 0.5213178294573644, "grad_norm": 0.913404643535614, "learning_rate": 2e-05, "loss": 0.6135, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8339, "tokens_per_second_per_gpu": 10556.41, "total_tokens": 823287568 }, { "epoch": 0.5213803450862715, "grad_norm": 0.8892742991447449, "learning_rate": 2e-05, "loss": 0.6415, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8340, "tokens_per_second_per_gpu": 10263.33, "total_tokens": 823384866 }, { "epoch": 0.5214428607151788, "grad_norm": 0.9253393411636353, "learning_rate": 2e-05, "loss": 0.6454, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8341, "tokens_per_second_per_gpu": 9916.51, "total_tokens": 823482735 }, { "epoch": 0.521505376344086, "grad_norm": 0.8788371682167053, "learning_rate": 2e-05, "loss": 0.6438, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8342, "tokens_per_second_per_gpu": 10531.63, "total_tokens": 823580494 }, { "epoch": 0.5215678919729932, "grad_norm": 0.8866180777549744, "learning_rate": 2e-05, "loss": 0.5939, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8343, "tokens_per_second_per_gpu": 10463.27, "total_tokens": 823679036 }, { "epoch": 0.5216304076019005, "grad_norm": 0.8919986486434937, "learning_rate": 2e-05, "loss": 0.6437, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8344, "tokens_per_second_per_gpu": 10815.72, "total_tokens": 823777747 }, { "epoch": 0.5216929232308077, "grad_norm": 0.9172625541687012, "learning_rate": 2e-05, "loss": 0.6511, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8345, "tokens_per_second_per_gpu": 10941.1, "total_tokens": 823882040 }, { "epoch": 0.521755438859715, "grad_norm": 0.8827292323112488, "learning_rate": 2e-05, "loss": 0.6385, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8346, "tokens_per_second_per_gpu": 10031.37, "total_tokens": 823979266 }, { "epoch": 0.5218179544886221, "grad_norm": 0.9873180389404297, "learning_rate": 2e-05, "loss": 0.5825, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8347, "tokens_per_second_per_gpu": 9521.97, "total_tokens": 824072451 }, { "epoch": 0.5218804701175294, "grad_norm": 0.9145336151123047, "learning_rate": 2e-05, "loss": 0.6345, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8348, "tokens_per_second_per_gpu": 10733.54, "total_tokens": 824172978 }, { "epoch": 0.5219429857464366, "grad_norm": 0.8312812447547913, "learning_rate": 2e-05, "loss": 0.6315, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8349, "tokens_per_second_per_gpu": 10766.43, "total_tokens": 824277125 }, { "epoch": 0.5220055013753438, "grad_norm": 0.9592630863189697, "learning_rate": 2e-05, "loss": 0.685, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8350, "tokens_per_second_per_gpu": 9928.75, "total_tokens": 824373775 }, { "epoch": 0.5220680170042511, "grad_norm": 0.9037217497825623, "learning_rate": 2e-05, "loss": 0.6858, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8351, "tokens_per_second_per_gpu": 10503.46, "total_tokens": 824475429 }, { "epoch": 0.5221305326331583, "grad_norm": 0.8961014151573181, "learning_rate": 2e-05, "loss": 0.6471, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8352, "tokens_per_second_per_gpu": 10615.24, "total_tokens": 824570366 }, { "epoch": 0.5221930482620655, "grad_norm": 0.9207763671875, "learning_rate": 2e-05, "loss": 0.6323, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8353, "tokens_per_second_per_gpu": 10481.52, "total_tokens": 824670193 }, { "epoch": 0.5222555638909727, "grad_norm": 0.8940718173980713, "learning_rate": 2e-05, "loss": 0.6336, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8354, "tokens_per_second_per_gpu": 10569.93, "total_tokens": 824770260 }, { "epoch": 0.52231807951988, "grad_norm": 0.949187159538269, "learning_rate": 2e-05, "loss": 0.6608, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8355, "tokens_per_second_per_gpu": 10752.89, "total_tokens": 824871891 }, { "epoch": 0.5223805951487872, "grad_norm": 0.8673849701881409, "learning_rate": 2e-05, "loss": 0.6203, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8356, "tokens_per_second_per_gpu": 10779.28, "total_tokens": 824969188 }, { "epoch": 0.5224431107776945, "grad_norm": 0.9537292718887329, "learning_rate": 2e-05, "loss": 0.666, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8357, "tokens_per_second_per_gpu": 10163.87, "total_tokens": 825067071 }, { "epoch": 0.5225056264066017, "grad_norm": 0.9005259275436401, "learning_rate": 2e-05, "loss": 0.6498, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8358, "tokens_per_second_per_gpu": 11231.14, "total_tokens": 825169144 }, { "epoch": 0.5225681420355088, "grad_norm": 0.8742976188659668, "learning_rate": 2e-05, "loss": 0.6366, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8359, "tokens_per_second_per_gpu": 11174.86, "total_tokens": 825266252 }, { "epoch": 0.5226306576644161, "grad_norm": 0.8758542537689209, "learning_rate": 2e-05, "loss": 0.6504, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8360, "tokens_per_second_per_gpu": 10671.7, "total_tokens": 825367409 }, { "epoch": 0.5226931732933233, "grad_norm": 0.8580408096313477, "learning_rate": 2e-05, "loss": 0.6676, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8361, "tokens_per_second_per_gpu": 10886.22, "total_tokens": 825470396 }, { "epoch": 0.5227556889222306, "grad_norm": 0.9016098380088806, "learning_rate": 2e-05, "loss": 0.6274, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8362, "tokens_per_second_per_gpu": 10171.89, "total_tokens": 825565999 }, { "epoch": 0.5228182045511378, "grad_norm": 0.8560010194778442, "learning_rate": 2e-05, "loss": 0.6357, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8363, "tokens_per_second_per_gpu": 10478.89, "total_tokens": 825667023 }, { "epoch": 0.522880720180045, "grad_norm": 0.9052055478096008, "learning_rate": 2e-05, "loss": 0.6363, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8364, "tokens_per_second_per_gpu": 11287.27, "total_tokens": 825766044 }, { "epoch": 0.5229432358089522, "grad_norm": 0.8680623769760132, "learning_rate": 2e-05, "loss": 0.6264, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8365, "tokens_per_second_per_gpu": 10333.06, "total_tokens": 825864704 }, { "epoch": 0.5230057514378594, "grad_norm": 0.8821583986282349, "learning_rate": 2e-05, "loss": 0.6113, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8366, "tokens_per_second_per_gpu": 10121.87, "total_tokens": 825961456 }, { "epoch": 0.5230682670667667, "grad_norm": 0.8986284732818604, "learning_rate": 2e-05, "loss": 0.6347, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8367, "tokens_per_second_per_gpu": 10202.01, "total_tokens": 826056912 }, { "epoch": 0.5231307826956739, "grad_norm": 0.9340334534645081, "learning_rate": 2e-05, "loss": 0.6562, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8368, "tokens_per_second_per_gpu": 11205.37, "total_tokens": 826156503 }, { "epoch": 0.5231932983245812, "grad_norm": 0.8802947402000427, "learning_rate": 2e-05, "loss": 0.6352, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8369, "tokens_per_second_per_gpu": 10977.01, "total_tokens": 826254044 }, { "epoch": 0.5232558139534884, "grad_norm": 0.8815593123435974, "learning_rate": 2e-05, "loss": 0.6165, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8370, "tokens_per_second_per_gpu": 10910.65, "total_tokens": 826351746 }, { "epoch": 0.5233183295823955, "grad_norm": 0.8874439001083374, "learning_rate": 2e-05, "loss": 0.6662, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8371, "tokens_per_second_per_gpu": 10659.86, "total_tokens": 826449732 }, { "epoch": 0.5233808452113028, "grad_norm": 0.8683885335922241, "learning_rate": 2e-05, "loss": 0.6754, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8372, "tokens_per_second_per_gpu": 10645.11, "total_tokens": 826552279 }, { "epoch": 0.52344336084021, "grad_norm": 0.8866203427314758, "learning_rate": 2e-05, "loss": 0.6416, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8373, "tokens_per_second_per_gpu": 10089.39, "total_tokens": 826651375 }, { "epoch": 0.5235058764691173, "grad_norm": 0.8663156032562256, "learning_rate": 2e-05, "loss": 0.6326, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8374, "tokens_per_second_per_gpu": 10635.85, "total_tokens": 826751122 }, { "epoch": 0.5235683920980245, "grad_norm": 0.9377747178077698, "learning_rate": 2e-05, "loss": 0.6479, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8375, "tokens_per_second_per_gpu": 11292.73, "total_tokens": 826849594 }, { "epoch": 0.5236309077269318, "grad_norm": 0.924919605255127, "learning_rate": 2e-05, "loss": 0.6315, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8376, "tokens_per_second_per_gpu": 10439.77, "total_tokens": 826944897 }, { "epoch": 0.5236934233558389, "grad_norm": 0.8532626032829285, "learning_rate": 2e-05, "loss": 0.6187, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8377, "tokens_per_second_per_gpu": 10718.03, "total_tokens": 827045023 }, { "epoch": 0.5237559389847461, "grad_norm": 0.8803242444992065, "learning_rate": 2e-05, "loss": 0.6104, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8378, "tokens_per_second_per_gpu": 10298.39, "total_tokens": 827141767 }, { "epoch": 0.5238184546136534, "grad_norm": 0.8954374194145203, "learning_rate": 2e-05, "loss": 0.6354, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8379, "tokens_per_second_per_gpu": 9902.35, "total_tokens": 827240103 }, { "epoch": 0.5238809702425606, "grad_norm": 0.8619998097419739, "learning_rate": 2e-05, "loss": 0.5847, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8380, "tokens_per_second_per_gpu": 10826.09, "total_tokens": 827337861 }, { "epoch": 0.5239434858714679, "grad_norm": 0.8698464632034302, "learning_rate": 2e-05, "loss": 0.6706, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8381, "tokens_per_second_per_gpu": 11653.16, "total_tokens": 827439752 }, { "epoch": 0.5240060015003751, "grad_norm": 0.8863192200660706, "learning_rate": 2e-05, "loss": 0.6722, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8382, "tokens_per_second_per_gpu": 10057.71, "total_tokens": 827538188 }, { "epoch": 0.5240685171292824, "grad_norm": 0.8938696980476379, "learning_rate": 2e-05, "loss": 0.6157, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8383, "tokens_per_second_per_gpu": 10780.37, "total_tokens": 827637420 }, { "epoch": 0.5241310327581895, "grad_norm": 0.9121155142784119, "learning_rate": 2e-05, "loss": 0.6789, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8384, "tokens_per_second_per_gpu": 10550.58, "total_tokens": 827737194 }, { "epoch": 0.5241935483870968, "grad_norm": 0.9014667272567749, "learning_rate": 2e-05, "loss": 0.6379, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8385, "tokens_per_second_per_gpu": 10385.01, "total_tokens": 827834595 }, { "epoch": 0.524256064016004, "grad_norm": 0.8868529796600342, "learning_rate": 2e-05, "loss": 0.6255, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8386, "tokens_per_second_per_gpu": 10919.73, "total_tokens": 827935545 }, { "epoch": 0.5243185796449112, "grad_norm": 0.9021631479263306, "learning_rate": 2e-05, "loss": 0.684, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8387, "tokens_per_second_per_gpu": 10706.79, "total_tokens": 828032035 }, { "epoch": 0.5243810952738185, "grad_norm": 0.8751550316810608, "learning_rate": 2e-05, "loss": 0.616, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8388, "tokens_per_second_per_gpu": 9983.95, "total_tokens": 828127147 }, { "epoch": 0.5244436109027257, "grad_norm": 0.8784644603729248, "learning_rate": 2e-05, "loss": 0.6491, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8389, "tokens_per_second_per_gpu": 10777.35, "total_tokens": 828227313 }, { "epoch": 0.5245061265316329, "grad_norm": 0.8731279969215393, "learning_rate": 2e-05, "loss": 0.6453, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8390, "tokens_per_second_per_gpu": 10487.67, "total_tokens": 828324836 }, { "epoch": 0.5245686421605401, "grad_norm": 0.8420231938362122, "learning_rate": 2e-05, "loss": 0.6755, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8391, "tokens_per_second_per_gpu": 10421.91, "total_tokens": 828427115 }, { "epoch": 0.5246311577894474, "grad_norm": 0.8894968032836914, "learning_rate": 2e-05, "loss": 0.6208, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8392, "tokens_per_second_per_gpu": 10005.5, "total_tokens": 828525573 }, { "epoch": 0.5246936734183546, "grad_norm": 0.8656946420669556, "learning_rate": 2e-05, "loss": 0.5973, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8393, "tokens_per_second_per_gpu": 10840.69, "total_tokens": 828626804 }, { "epoch": 0.5247561890472618, "grad_norm": 0.9161035418510437, "learning_rate": 2e-05, "loss": 0.6321, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8394, "tokens_per_second_per_gpu": 10546.66, "total_tokens": 828725211 }, { "epoch": 0.5248187046761691, "grad_norm": 0.8702794909477234, "learning_rate": 2e-05, "loss": 0.6694, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8395, "tokens_per_second_per_gpu": 10649.5, "total_tokens": 828829238 }, { "epoch": 0.5248812203050762, "grad_norm": 0.8518844842910767, "learning_rate": 2e-05, "loss": 0.5779, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8396, "tokens_per_second_per_gpu": 10595.69, "total_tokens": 828928666 }, { "epoch": 0.5249437359339835, "grad_norm": 0.8579556345939636, "learning_rate": 2e-05, "loss": 0.6114, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8397, "tokens_per_second_per_gpu": 10637.18, "total_tokens": 829028027 }, { "epoch": 0.5250062515628907, "grad_norm": 0.8779830932617188, "learning_rate": 2e-05, "loss": 0.6171, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8398, "tokens_per_second_per_gpu": 11052.68, "total_tokens": 829128185 }, { "epoch": 0.525068767191798, "grad_norm": 0.9281665682792664, "learning_rate": 2e-05, "loss": 0.63, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8399, "tokens_per_second_per_gpu": 10048.11, "total_tokens": 829223505 }, { "epoch": 0.5251312828207052, "grad_norm": 0.9197819232940674, "learning_rate": 2e-05, "loss": 0.6245, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8400, "tokens_per_second_per_gpu": 10240.69, "total_tokens": 829320450 }, { "epoch": 0.5251937984496124, "grad_norm": 0.9237115383148193, "learning_rate": 2e-05, "loss": 0.6876, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8401, "tokens_per_second_per_gpu": 10530.21, "total_tokens": 829418412 }, { "epoch": 0.5252563140785196, "grad_norm": 0.8618923425674438, "learning_rate": 2e-05, "loss": 0.5973, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8402, "tokens_per_second_per_gpu": 11041.7, "total_tokens": 829516729 }, { "epoch": 0.5253188297074268, "grad_norm": 0.8878052830696106, "learning_rate": 2e-05, "loss": 0.6687, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8403, "tokens_per_second_per_gpu": 10973.38, "total_tokens": 829618973 }, { "epoch": 0.5253813453363341, "grad_norm": 0.8732693195343018, "learning_rate": 2e-05, "loss": 0.6238, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8404, "tokens_per_second_per_gpu": 10432.77, "total_tokens": 829719328 }, { "epoch": 0.5254438609652413, "grad_norm": 0.917131781578064, "learning_rate": 2e-05, "loss": 0.6343, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8405, "tokens_per_second_per_gpu": 10233.04, "total_tokens": 829817870 }, { "epoch": 0.5255063765941486, "grad_norm": 0.8737123608589172, "learning_rate": 2e-05, "loss": 0.6629, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8406, "tokens_per_second_per_gpu": 10767.0, "total_tokens": 829918662 }, { "epoch": 0.5255688922230558, "grad_norm": 0.90995854139328, "learning_rate": 2e-05, "loss": 0.6473, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8407, "tokens_per_second_per_gpu": 9697.77, "total_tokens": 830015313 }, { "epoch": 0.5256314078519629, "grad_norm": 0.9003249406814575, "learning_rate": 2e-05, "loss": 0.6259, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8408, "tokens_per_second_per_gpu": 11245.67, "total_tokens": 830114835 }, { "epoch": 0.5256939234808702, "grad_norm": 0.8807108402252197, "learning_rate": 2e-05, "loss": 0.5992, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8409, "tokens_per_second_per_gpu": 10960.69, "total_tokens": 830214303 }, { "epoch": 0.5257564391097774, "grad_norm": 0.8749895691871643, "learning_rate": 2e-05, "loss": 0.6078, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8410, "tokens_per_second_per_gpu": 10918.79, "total_tokens": 830315109 }, { "epoch": 0.5258189547386847, "grad_norm": 0.8777596354484558, "learning_rate": 2e-05, "loss": 0.6647, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8411, "tokens_per_second_per_gpu": 11037.01, "total_tokens": 830417335 }, { "epoch": 0.5258814703675919, "grad_norm": 0.8903509974479675, "learning_rate": 2e-05, "loss": 0.6316, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8412, "tokens_per_second_per_gpu": 10463.42, "total_tokens": 830516011 }, { "epoch": 0.5259439859964992, "grad_norm": 0.8985741138458252, "learning_rate": 2e-05, "loss": 0.6151, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8413, "tokens_per_second_per_gpu": 10692.23, "total_tokens": 830613629 }, { "epoch": 0.5260065016254063, "grad_norm": 0.8656763434410095, "learning_rate": 2e-05, "loss": 0.6237, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8414, "tokens_per_second_per_gpu": 10500.76, "total_tokens": 830715551 }, { "epoch": 0.5260690172543135, "grad_norm": 0.8786933422088623, "learning_rate": 2e-05, "loss": 0.6414, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8415, "tokens_per_second_per_gpu": 10454.88, "total_tokens": 830812502 }, { "epoch": 0.5261315328832208, "grad_norm": 0.8839007616043091, "learning_rate": 2e-05, "loss": 0.6342, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8416, "tokens_per_second_per_gpu": 10307.85, "total_tokens": 830909492 }, { "epoch": 0.526194048512128, "grad_norm": 0.8938955068588257, "learning_rate": 2e-05, "loss": 0.6415, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8417, "tokens_per_second_per_gpu": 10438.9, "total_tokens": 831006655 }, { "epoch": 0.5262565641410353, "grad_norm": 0.8999878764152527, "learning_rate": 2e-05, "loss": 0.6732, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8418, "tokens_per_second_per_gpu": 10565.59, "total_tokens": 831109359 }, { "epoch": 0.5263190797699425, "grad_norm": 0.8525960445404053, "learning_rate": 2e-05, "loss": 0.6094, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8419, "tokens_per_second_per_gpu": 10915.35, "total_tokens": 831211338 }, { "epoch": 0.5263815953988498, "grad_norm": 0.8725470900535583, "learning_rate": 2e-05, "loss": 0.6127, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8420, "tokens_per_second_per_gpu": 10222.25, "total_tokens": 831307245 }, { "epoch": 0.5264441110277569, "grad_norm": 0.9049758911132812, "learning_rate": 2e-05, "loss": 0.6561, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8421, "tokens_per_second_per_gpu": 10586.21, "total_tokens": 831404959 }, { "epoch": 0.5265066266566641, "grad_norm": 0.8609960079193115, "learning_rate": 2e-05, "loss": 0.6217, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8422, "tokens_per_second_per_gpu": 10223.35, "total_tokens": 831500870 }, { "epoch": 0.5265691422855714, "grad_norm": 0.8850730061531067, "learning_rate": 2e-05, "loss": 0.6473, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8423, "tokens_per_second_per_gpu": 10562.54, "total_tokens": 831597867 }, { "epoch": 0.5266316579144786, "grad_norm": 0.8618906140327454, "learning_rate": 2e-05, "loss": 0.6026, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8424, "tokens_per_second_per_gpu": 10629.77, "total_tokens": 831692451 }, { "epoch": 0.5266941735433859, "grad_norm": 0.8795031309127808, "learning_rate": 2e-05, "loss": 0.5991, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8425, "tokens_per_second_per_gpu": 10627.05, "total_tokens": 831788887 }, { "epoch": 0.5267566891722931, "grad_norm": 0.910893440246582, "learning_rate": 2e-05, "loss": 0.6122, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8426, "tokens_per_second_per_gpu": 10932.21, "total_tokens": 831885040 }, { "epoch": 0.5268192048012003, "grad_norm": 0.8999918699264526, "learning_rate": 2e-05, "loss": 0.6464, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8427, "tokens_per_second_per_gpu": 10581.9, "total_tokens": 831985642 }, { "epoch": 0.5268817204301075, "grad_norm": 0.911586582660675, "learning_rate": 2e-05, "loss": 0.6101, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8428, "tokens_per_second_per_gpu": 9810.96, "total_tokens": 832079325 }, { "epoch": 0.5269442360590147, "grad_norm": 0.8914942741394043, "learning_rate": 2e-05, "loss": 0.614, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8429, "tokens_per_second_per_gpu": 9914.69, "total_tokens": 832171907 }, { "epoch": 0.527006751687922, "grad_norm": 0.8643706440925598, "learning_rate": 2e-05, "loss": 0.6232, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8430, "tokens_per_second_per_gpu": 10042.45, "total_tokens": 832269478 }, { "epoch": 0.5270692673168292, "grad_norm": 0.8942422270774841, "learning_rate": 2e-05, "loss": 0.6388, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8431, "tokens_per_second_per_gpu": 10231.27, "total_tokens": 832368116 }, { "epoch": 0.5271317829457365, "grad_norm": 0.8980981707572937, "learning_rate": 2e-05, "loss": 0.6286, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8432, "tokens_per_second_per_gpu": 10147.22, "total_tokens": 832461940 }, { "epoch": 0.5271942985746436, "grad_norm": 0.8607289791107178, "learning_rate": 2e-05, "loss": 0.6653, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8433, "tokens_per_second_per_gpu": 11037.0, "total_tokens": 832566445 }, { "epoch": 0.5272568142035509, "grad_norm": 0.9045074582099915, "learning_rate": 2e-05, "loss": 0.6298, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8434, "tokens_per_second_per_gpu": 9882.01, "total_tokens": 832665939 }, { "epoch": 0.5273193298324581, "grad_norm": 0.9631200432777405, "learning_rate": 2e-05, "loss": 0.6481, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8435, "tokens_per_second_per_gpu": 9701.62, "total_tokens": 832761933 }, { "epoch": 0.5273818454613654, "grad_norm": 0.8824834823608398, "learning_rate": 2e-05, "loss": 0.6218, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8436, "tokens_per_second_per_gpu": 11023.69, "total_tokens": 832861916 }, { "epoch": 0.5274443610902726, "grad_norm": 0.8877647519111633, "learning_rate": 2e-05, "loss": 0.6267, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8437, "tokens_per_second_per_gpu": 10605.9, "total_tokens": 832961810 }, { "epoch": 0.5275068767191798, "grad_norm": 0.9059734344482422, "learning_rate": 2e-05, "loss": 0.6832, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8438, "tokens_per_second_per_gpu": 10800.02, "total_tokens": 833063934 }, { "epoch": 0.527569392348087, "grad_norm": 0.9155082106590271, "learning_rate": 2e-05, "loss": 0.6856, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8439, "tokens_per_second_per_gpu": 11205.92, "total_tokens": 833164938 }, { "epoch": 0.5276319079769942, "grad_norm": 0.9260783195495605, "learning_rate": 2e-05, "loss": 0.6236, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8440, "tokens_per_second_per_gpu": 9987.68, "total_tokens": 833261403 }, { "epoch": 0.5276944236059015, "grad_norm": 0.9781543612480164, "learning_rate": 2e-05, "loss": 0.6121, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8441, "tokens_per_second_per_gpu": 10181.35, "total_tokens": 833351262 }, { "epoch": 0.5277569392348087, "grad_norm": 0.9167926907539368, "learning_rate": 2e-05, "loss": 0.6449, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8442, "tokens_per_second_per_gpu": 11646.8, "total_tokens": 833449384 }, { "epoch": 0.527819454863716, "grad_norm": 0.8970264196395874, "learning_rate": 2e-05, "loss": 0.6133, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8443, "tokens_per_second_per_gpu": 10273.38, "total_tokens": 833544653 }, { "epoch": 0.5278819704926232, "grad_norm": 0.8897530436515808, "learning_rate": 2e-05, "loss": 0.6065, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8444, "tokens_per_second_per_gpu": 10651.18, "total_tokens": 833643836 }, { "epoch": 0.5279444861215303, "grad_norm": 0.8855018019676208, "learning_rate": 2e-05, "loss": 0.6503, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8445, "tokens_per_second_per_gpu": 10877.32, "total_tokens": 833744250 }, { "epoch": 0.5280070017504376, "grad_norm": 0.9442699551582336, "learning_rate": 2e-05, "loss": 0.6684, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8446, "tokens_per_second_per_gpu": 10872.05, "total_tokens": 833850200 }, { "epoch": 0.5280695173793448, "grad_norm": 0.9163475632667542, "learning_rate": 2e-05, "loss": 0.6502, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8447, "tokens_per_second_per_gpu": 9270.11, "total_tokens": 833943864 }, { "epoch": 0.5281320330082521, "grad_norm": 0.8825747966766357, "learning_rate": 2e-05, "loss": 0.6192, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8448, "tokens_per_second_per_gpu": 9965.84, "total_tokens": 834040681 }, { "epoch": 0.5281945486371593, "grad_norm": 0.9506046175956726, "learning_rate": 2e-05, "loss": 0.651, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8449, "tokens_per_second_per_gpu": 9903.33, "total_tokens": 834140407 }, { "epoch": 0.5282570642660666, "grad_norm": 0.8735379576683044, "learning_rate": 2e-05, "loss": 0.6002, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8450, "tokens_per_second_per_gpu": 10504.58, "total_tokens": 834238384 }, { "epoch": 0.5283195798949737, "grad_norm": 0.8884105086326599, "learning_rate": 2e-05, "loss": 0.6127, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8451, "tokens_per_second_per_gpu": 10743.99, "total_tokens": 834338402 }, { "epoch": 0.5283820955238809, "grad_norm": 0.9507166743278503, "learning_rate": 2e-05, "loss": 0.6458, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8452, "tokens_per_second_per_gpu": 9871.62, "total_tokens": 834430784 }, { "epoch": 0.5284446111527882, "grad_norm": 0.8918544054031372, "learning_rate": 2e-05, "loss": 0.6528, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8453, "tokens_per_second_per_gpu": 10886.65, "total_tokens": 834530835 }, { "epoch": 0.5285071267816954, "grad_norm": 0.8977218270301819, "learning_rate": 2e-05, "loss": 0.5925, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8454, "tokens_per_second_per_gpu": 10229.24, "total_tokens": 834628823 }, { "epoch": 0.5285696424106027, "grad_norm": 0.8751914501190186, "learning_rate": 2e-05, "loss": 0.6419, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8455, "tokens_per_second_per_gpu": 10227.9, "total_tokens": 834726960 }, { "epoch": 0.5286321580395099, "grad_norm": 0.8986554741859436, "learning_rate": 2e-05, "loss": 0.6344, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8456, "tokens_per_second_per_gpu": 11064.06, "total_tokens": 834828763 }, { "epoch": 0.5286946736684172, "grad_norm": 0.8929587602615356, "learning_rate": 2e-05, "loss": 0.6572, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8457, "tokens_per_second_per_gpu": 10751.39, "total_tokens": 834928096 }, { "epoch": 0.5287571892973243, "grad_norm": 0.8657926917076111, "learning_rate": 2e-05, "loss": 0.6106, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8458, "tokens_per_second_per_gpu": 10442.38, "total_tokens": 835024131 }, { "epoch": 0.5288197049262315, "grad_norm": 0.9003428816795349, "learning_rate": 2e-05, "loss": 0.6662, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8459, "tokens_per_second_per_gpu": 10526.37, "total_tokens": 835121575 }, { "epoch": 0.5288822205551388, "grad_norm": 0.9035148620605469, "learning_rate": 2e-05, "loss": 0.6688, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8460, "tokens_per_second_per_gpu": 10336.0, "total_tokens": 835218604 }, { "epoch": 0.528944736184046, "grad_norm": 0.9018093347549438, "learning_rate": 2e-05, "loss": 0.6383, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8461, "tokens_per_second_per_gpu": 11263.52, "total_tokens": 835311012 }, { "epoch": 0.5290072518129533, "grad_norm": 0.8862147331237793, "learning_rate": 2e-05, "loss": 0.6551, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8462, "tokens_per_second_per_gpu": 11199.52, "total_tokens": 835414082 }, { "epoch": 0.5290697674418605, "grad_norm": 0.8599522709846497, "learning_rate": 2e-05, "loss": 0.6309, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8463, "tokens_per_second_per_gpu": 10113.18, "total_tokens": 835512486 }, { "epoch": 0.5291322830707677, "grad_norm": 0.9041678309440613, "learning_rate": 2e-05, "loss": 0.6506, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8464, "tokens_per_second_per_gpu": 11112.99, "total_tokens": 835610091 }, { "epoch": 0.5291947986996749, "grad_norm": 0.8521416187286377, "learning_rate": 2e-05, "loss": 0.624, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8465, "tokens_per_second_per_gpu": 10920.98, "total_tokens": 835711019 }, { "epoch": 0.5292573143285821, "grad_norm": 0.8516486287117004, "learning_rate": 2e-05, "loss": 0.6476, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8466, "tokens_per_second_per_gpu": 11303.41, "total_tokens": 835814940 }, { "epoch": 0.5293198299574894, "grad_norm": 0.9098033308982849, "learning_rate": 2e-05, "loss": 0.6529, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8467, "tokens_per_second_per_gpu": 10272.45, "total_tokens": 835914458 }, { "epoch": 0.5293823455863966, "grad_norm": 0.8692653775215149, "learning_rate": 2e-05, "loss": 0.6353, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8468, "tokens_per_second_per_gpu": 10030.32, "total_tokens": 836013376 }, { "epoch": 0.5294448612153039, "grad_norm": 0.8848720788955688, "learning_rate": 2e-05, "loss": 0.6624, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8469, "tokens_per_second_per_gpu": 10938.91, "total_tokens": 836113586 }, { "epoch": 0.529507376844211, "grad_norm": 0.9504009485244751, "learning_rate": 2e-05, "loss": 0.6586, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8470, "tokens_per_second_per_gpu": 10606.17, "total_tokens": 836212074 }, { "epoch": 0.5295698924731183, "grad_norm": 0.9179167747497559, "learning_rate": 2e-05, "loss": 0.6403, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8471, "tokens_per_second_per_gpu": 10417.93, "total_tokens": 836310482 }, { "epoch": 0.5296324081020255, "grad_norm": 0.8680469393730164, "learning_rate": 2e-05, "loss": 0.6342, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8472, "tokens_per_second_per_gpu": 10694.56, "total_tokens": 836409918 }, { "epoch": 0.5296949237309327, "grad_norm": 0.9051601886749268, "learning_rate": 2e-05, "loss": 0.6493, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8473, "tokens_per_second_per_gpu": 10375.76, "total_tokens": 836510883 }, { "epoch": 0.52975743935984, "grad_norm": 0.8860835433006287, "learning_rate": 2e-05, "loss": 0.6228, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8474, "tokens_per_second_per_gpu": 10143.59, "total_tokens": 836609773 }, { "epoch": 0.5298199549887472, "grad_norm": 0.8490163683891296, "learning_rate": 2e-05, "loss": 0.5997, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8475, "tokens_per_second_per_gpu": 10845.13, "total_tokens": 836710272 }, { "epoch": 0.5298824706176544, "grad_norm": 0.8801975846290588, "learning_rate": 2e-05, "loss": 0.6323, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8476, "tokens_per_second_per_gpu": 10406.13, "total_tokens": 836810777 }, { "epoch": 0.5299449862465616, "grad_norm": 0.9301796555519104, "learning_rate": 2e-05, "loss": 0.6356, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8477, "tokens_per_second_per_gpu": 9736.28, "total_tokens": 836904058 }, { "epoch": 0.5300075018754689, "grad_norm": 0.8989614248275757, "learning_rate": 2e-05, "loss": 0.6545, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8478, "tokens_per_second_per_gpu": 10291.4, "total_tokens": 837004265 }, { "epoch": 0.5300700175043761, "grad_norm": 0.8495922088623047, "learning_rate": 2e-05, "loss": 0.6146, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8479, "tokens_per_second_per_gpu": 10194.92, "total_tokens": 837102853 }, { "epoch": 0.5301325331332833, "grad_norm": 0.8633130788803101, "learning_rate": 2e-05, "loss": 0.6242, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8480, "tokens_per_second_per_gpu": 10624.32, "total_tokens": 837201950 }, { "epoch": 0.5301950487621906, "grad_norm": 0.8521500825881958, "learning_rate": 2e-05, "loss": 0.6159, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8481, "tokens_per_second_per_gpu": 10770.58, "total_tokens": 837304883 }, { "epoch": 0.5302575643910977, "grad_norm": 0.9093846082687378, "learning_rate": 2e-05, "loss": 0.6751, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8482, "tokens_per_second_per_gpu": 10566.64, "total_tokens": 837405774 }, { "epoch": 0.530320080020005, "grad_norm": 0.8834507465362549, "learning_rate": 2e-05, "loss": 0.6441, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8483, "tokens_per_second_per_gpu": 10778.89, "total_tokens": 837502824 }, { "epoch": 0.5303825956489122, "grad_norm": 0.8821808695793152, "learning_rate": 2e-05, "loss": 0.6029, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8484, "tokens_per_second_per_gpu": 9704.06, "total_tokens": 837597237 }, { "epoch": 0.5304451112778195, "grad_norm": 0.9125723242759705, "learning_rate": 2e-05, "loss": 0.6455, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8485, "tokens_per_second_per_gpu": 10930.18, "total_tokens": 837696031 }, { "epoch": 0.5305076269067267, "grad_norm": 0.9425255060195923, "learning_rate": 2e-05, "loss": 0.6691, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8486, "tokens_per_second_per_gpu": 10362.27, "total_tokens": 837796020 }, { "epoch": 0.530570142535634, "grad_norm": 0.9204962849617004, "learning_rate": 2e-05, "loss": 0.6174, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8487, "tokens_per_second_per_gpu": 10297.05, "total_tokens": 837892262 }, { "epoch": 0.5306326581645411, "grad_norm": 0.8764106035232544, "learning_rate": 2e-05, "loss": 0.607, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8488, "tokens_per_second_per_gpu": 10879.87, "total_tokens": 837991819 }, { "epoch": 0.5306951737934483, "grad_norm": 0.8916985988616943, "learning_rate": 2e-05, "loss": 0.6436, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8489, "tokens_per_second_per_gpu": 10410.63, "total_tokens": 838092812 }, { "epoch": 0.5307576894223556, "grad_norm": 0.8747442364692688, "learning_rate": 2e-05, "loss": 0.6787, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8490, "tokens_per_second_per_gpu": 11070.0, "total_tokens": 838194666 }, { "epoch": 0.5308202050512628, "grad_norm": 0.9038734436035156, "learning_rate": 2e-05, "loss": 0.6436, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8491, "tokens_per_second_per_gpu": 9710.83, "total_tokens": 838292554 }, { "epoch": 0.5308827206801701, "grad_norm": 0.8957813382148743, "learning_rate": 2e-05, "loss": 0.5986, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8492, "tokens_per_second_per_gpu": 10990.89, "total_tokens": 838391178 }, { "epoch": 0.5309452363090773, "grad_norm": 0.888709306716919, "learning_rate": 2e-05, "loss": 0.6243, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8493, "tokens_per_second_per_gpu": 10588.27, "total_tokens": 838490653 }, { "epoch": 0.5310077519379846, "grad_norm": 0.9327101707458496, "learning_rate": 2e-05, "loss": 0.692, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8494, "tokens_per_second_per_gpu": 10055.9, "total_tokens": 838591145 }, { "epoch": 0.5310702675668917, "grad_norm": 0.9048532843589783, "learning_rate": 2e-05, "loss": 0.644, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8495, "tokens_per_second_per_gpu": 11050.48, "total_tokens": 838687122 }, { "epoch": 0.5311327831957989, "grad_norm": 0.8921748995780945, "learning_rate": 2e-05, "loss": 0.6164, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8496, "tokens_per_second_per_gpu": 9837.27, "total_tokens": 838780161 }, { "epoch": 0.5311952988247062, "grad_norm": 0.8746699094772339, "learning_rate": 2e-05, "loss": 0.603, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8497, "tokens_per_second_per_gpu": 10700.66, "total_tokens": 838877696 }, { "epoch": 0.5312578144536134, "grad_norm": 0.8884233832359314, "learning_rate": 2e-05, "loss": 0.5798, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8498, "tokens_per_second_per_gpu": 9976.09, "total_tokens": 838975759 }, { "epoch": 0.5313203300825207, "grad_norm": 0.8801807165145874, "learning_rate": 2e-05, "loss": 0.646, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8499, "tokens_per_second_per_gpu": 11141.13, "total_tokens": 839075700 }, { "epoch": 0.5313828457114279, "grad_norm": 0.8935195207595825, "learning_rate": 2e-05, "loss": 0.6428, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8500, "tokens_per_second_per_gpu": 10699.62, "total_tokens": 839175808 }, { "epoch": 0.531445361340335, "grad_norm": 0.8774140477180481, "learning_rate": 2e-05, "loss": 0.6776, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8501, "tokens_per_second_per_gpu": 11113.5, "total_tokens": 839277885 }, { "epoch": 0.5315078769692423, "grad_norm": 0.9218875169754028, "learning_rate": 2e-05, "loss": 0.6, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8502, "tokens_per_second_per_gpu": 10226.89, "total_tokens": 839375300 }, { "epoch": 0.5315703925981495, "grad_norm": 0.8639991283416748, "learning_rate": 2e-05, "loss": 0.6228, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8503, "tokens_per_second_per_gpu": 10696.81, "total_tokens": 839475660 }, { "epoch": 0.5316329082270568, "grad_norm": 0.883733332157135, "learning_rate": 2e-05, "loss": 0.6301, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8504, "tokens_per_second_per_gpu": 10284.04, "total_tokens": 839572845 }, { "epoch": 0.531695423855964, "grad_norm": 0.8732811808586121, "learning_rate": 2e-05, "loss": 0.5876, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8505, "tokens_per_second_per_gpu": 10794.15, "total_tokens": 839671896 }, { "epoch": 0.5317579394848713, "grad_norm": 0.890543520450592, "learning_rate": 2e-05, "loss": 0.6093, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8506, "tokens_per_second_per_gpu": 10434.86, "total_tokens": 839770979 }, { "epoch": 0.5318204551137784, "grad_norm": 0.9001742005348206, "learning_rate": 2e-05, "loss": 0.6244, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8507, "tokens_per_second_per_gpu": 10622.3, "total_tokens": 839870925 }, { "epoch": 0.5318829707426856, "grad_norm": 0.9108903408050537, "learning_rate": 2e-05, "loss": 0.6725, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8508, "tokens_per_second_per_gpu": 11073.58, "total_tokens": 839971224 }, { "epoch": 0.5319454863715929, "grad_norm": 0.8891245722770691, "learning_rate": 2e-05, "loss": 0.625, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8509, "tokens_per_second_per_gpu": 10303.67, "total_tokens": 840071295 }, { "epoch": 0.5320080020005001, "grad_norm": 0.8602601289749146, "learning_rate": 2e-05, "loss": 0.5806, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8510, "tokens_per_second_per_gpu": 10271.85, "total_tokens": 840166734 }, { "epoch": 0.5320705176294074, "grad_norm": 0.9339329600334167, "learning_rate": 2e-05, "loss": 0.6321, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8511, "tokens_per_second_per_gpu": 10386.16, "total_tokens": 840265892 }, { "epoch": 0.5321330332583146, "grad_norm": 0.9191851019859314, "learning_rate": 2e-05, "loss": 0.6036, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8512, "tokens_per_second_per_gpu": 10116.35, "total_tokens": 840359414 }, { "epoch": 0.5321955488872218, "grad_norm": 0.9094982147216797, "learning_rate": 2e-05, "loss": 0.6264, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8513, "tokens_per_second_per_gpu": 10946.97, "total_tokens": 840461791 }, { "epoch": 0.532258064516129, "grad_norm": 0.9041308164596558, "learning_rate": 2e-05, "loss": 0.6743, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8514, "tokens_per_second_per_gpu": 10931.05, "total_tokens": 840561423 }, { "epoch": 0.5323205801450362, "grad_norm": 0.8702443242073059, "learning_rate": 2e-05, "loss": 0.6255, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8515, "tokens_per_second_per_gpu": 10158.33, "total_tokens": 840660259 }, { "epoch": 0.5323830957739435, "grad_norm": 0.8771026134490967, "learning_rate": 2e-05, "loss": 0.6242, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8516, "tokens_per_second_per_gpu": 10637.56, "total_tokens": 840760012 }, { "epoch": 0.5324456114028507, "grad_norm": 0.9017900228500366, "learning_rate": 2e-05, "loss": 0.6553, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8517, "tokens_per_second_per_gpu": 10395.08, "total_tokens": 840857107 }, { "epoch": 0.532508127031758, "grad_norm": 0.8683598637580872, "learning_rate": 2e-05, "loss": 0.6343, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8518, "tokens_per_second_per_gpu": 11081.12, "total_tokens": 840957757 }, { "epoch": 0.5325706426606651, "grad_norm": 0.8581838011741638, "learning_rate": 2e-05, "loss": 0.6109, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8519, "tokens_per_second_per_gpu": 10659.78, "total_tokens": 841055599 }, { "epoch": 0.5326331582895724, "grad_norm": 0.8690276145935059, "learning_rate": 2e-05, "loss": 0.6244, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8520, "tokens_per_second_per_gpu": 10625.51, "total_tokens": 841154092 }, { "epoch": 0.5326956739184796, "grad_norm": 0.861362636089325, "learning_rate": 2e-05, "loss": 0.6634, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8521, "tokens_per_second_per_gpu": 10114.86, "total_tokens": 841254240 }, { "epoch": 0.5327581895473869, "grad_norm": 0.9080403447151184, "learning_rate": 2e-05, "loss": 0.6018, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8522, "tokens_per_second_per_gpu": 10259.33, "total_tokens": 841350898 }, { "epoch": 0.5328207051762941, "grad_norm": 0.8852992653846741, "learning_rate": 2e-05, "loss": 0.6146, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8523, "tokens_per_second_per_gpu": 10483.3, "total_tokens": 841449769 }, { "epoch": 0.5328832208052013, "grad_norm": 0.8726316094398499, "learning_rate": 2e-05, "loss": 0.6077, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8524, "tokens_per_second_per_gpu": 10501.16, "total_tokens": 841546813 }, { "epoch": 0.5329457364341085, "grad_norm": 0.876846194267273, "learning_rate": 2e-05, "loss": 0.6741, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8525, "tokens_per_second_per_gpu": 11401.32, "total_tokens": 841649844 }, { "epoch": 0.5330082520630157, "grad_norm": 0.8534049391746521, "learning_rate": 2e-05, "loss": 0.624, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8526, "tokens_per_second_per_gpu": 11169.81, "total_tokens": 841752320 }, { "epoch": 0.533070767691923, "grad_norm": 0.8690082430839539, "learning_rate": 2e-05, "loss": 0.6616, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8527, "tokens_per_second_per_gpu": 11458.64, "total_tokens": 841854841 }, { "epoch": 0.5331332833208302, "grad_norm": 0.9120599031448364, "learning_rate": 2e-05, "loss": 0.6564, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8528, "tokens_per_second_per_gpu": 10542.68, "total_tokens": 841951752 }, { "epoch": 0.5331957989497375, "grad_norm": 0.8658961653709412, "learning_rate": 2e-05, "loss": 0.5905, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8529, "tokens_per_second_per_gpu": 9799.31, "total_tokens": 842045795 }, { "epoch": 0.5332583145786447, "grad_norm": 0.8782665133476257, "learning_rate": 2e-05, "loss": 0.6219, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8530, "tokens_per_second_per_gpu": 10875.55, "total_tokens": 842145686 }, { "epoch": 0.5333208302075518, "grad_norm": 0.8962684869766235, "learning_rate": 2e-05, "loss": 0.657, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8531, "tokens_per_second_per_gpu": 10719.03, "total_tokens": 842244416 }, { "epoch": 0.5333833458364591, "grad_norm": 0.8667622208595276, "learning_rate": 2e-05, "loss": 0.6606, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8532, "tokens_per_second_per_gpu": 11309.89, "total_tokens": 842348963 }, { "epoch": 0.5334458614653663, "grad_norm": 0.8651870489120483, "learning_rate": 2e-05, "loss": 0.6814, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8533, "tokens_per_second_per_gpu": 10368.17, "total_tokens": 842449767 }, { "epoch": 0.5335083770942736, "grad_norm": 0.8700074553489685, "learning_rate": 2e-05, "loss": 0.6069, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8534, "tokens_per_second_per_gpu": 10612.81, "total_tokens": 842550662 }, { "epoch": 0.5335708927231808, "grad_norm": 0.8801368474960327, "learning_rate": 2e-05, "loss": 0.6723, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8535, "tokens_per_second_per_gpu": 10884.33, "total_tokens": 842652879 }, { "epoch": 0.5336334083520881, "grad_norm": 0.9215044379234314, "learning_rate": 2e-05, "loss": 0.6756, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8536, "tokens_per_second_per_gpu": 10793.18, "total_tokens": 842750497 }, { "epoch": 0.5336959239809953, "grad_norm": 0.8620269298553467, "learning_rate": 2e-05, "loss": 0.6385, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8537, "tokens_per_second_per_gpu": 10717.75, "total_tokens": 842852757 }, { "epoch": 0.5337584396099024, "grad_norm": 0.8812929391860962, "learning_rate": 2e-05, "loss": 0.6346, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8538, "tokens_per_second_per_gpu": 9127.47, "total_tokens": 842946526 }, { "epoch": 0.5338209552388097, "grad_norm": 0.9026370048522949, "learning_rate": 2e-05, "loss": 0.6377, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8539, "tokens_per_second_per_gpu": 10035.92, "total_tokens": 843043545 }, { "epoch": 0.5338834708677169, "grad_norm": 0.8544538617134094, "learning_rate": 2e-05, "loss": 0.6484, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8540, "tokens_per_second_per_gpu": 11489.95, "total_tokens": 843147076 }, { "epoch": 0.5339459864966242, "grad_norm": 0.8784525990486145, "learning_rate": 2e-05, "loss": 0.6506, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8541, "tokens_per_second_per_gpu": 10994.74, "total_tokens": 843250178 }, { "epoch": 0.5340085021255314, "grad_norm": 0.8816508054733276, "learning_rate": 2e-05, "loss": 0.6526, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8542, "tokens_per_second_per_gpu": 10026.03, "total_tokens": 843345933 }, { "epoch": 0.5340710177544387, "grad_norm": 0.8553927540779114, "learning_rate": 2e-05, "loss": 0.593, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8543, "tokens_per_second_per_gpu": 11147.93, "total_tokens": 843446128 }, { "epoch": 0.5341335333833458, "grad_norm": 0.9040263295173645, "learning_rate": 2e-05, "loss": 0.6366, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8544, "tokens_per_second_per_gpu": 10169.42, "total_tokens": 843541519 }, { "epoch": 0.534196049012253, "grad_norm": 0.8877305388450623, "learning_rate": 2e-05, "loss": 0.6492, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8545, "tokens_per_second_per_gpu": 10513.33, "total_tokens": 843642333 }, { "epoch": 0.5342585646411603, "grad_norm": 0.8988702893257141, "learning_rate": 2e-05, "loss": 0.6201, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8546, "tokens_per_second_per_gpu": 10674.32, "total_tokens": 843743236 }, { "epoch": 0.5343210802700675, "grad_norm": 0.9060669541358948, "learning_rate": 2e-05, "loss": 0.6717, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8547, "tokens_per_second_per_gpu": 10964.17, "total_tokens": 843840173 }, { "epoch": 0.5343835958989748, "grad_norm": 0.8898111581802368, "learning_rate": 2e-05, "loss": 0.6013, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8548, "tokens_per_second_per_gpu": 10335.77, "total_tokens": 843937147 }, { "epoch": 0.534446111527882, "grad_norm": 0.8558681607246399, "learning_rate": 2e-05, "loss": 0.6575, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8549, "tokens_per_second_per_gpu": 11311.64, "total_tokens": 844041009 }, { "epoch": 0.5345086271567892, "grad_norm": 0.9163453578948975, "learning_rate": 2e-05, "loss": 0.658, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8550, "tokens_per_second_per_gpu": 11276.32, "total_tokens": 844141935 }, { "epoch": 0.5345711427856964, "grad_norm": 0.8840394020080566, "learning_rate": 2e-05, "loss": 0.6145, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8551, "tokens_per_second_per_gpu": 10848.09, "total_tokens": 844242455 }, { "epoch": 0.5346336584146036, "grad_norm": 0.8851736783981323, "learning_rate": 2e-05, "loss": 0.6535, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8552, "tokens_per_second_per_gpu": 10622.78, "total_tokens": 844344454 }, { "epoch": 0.5346961740435109, "grad_norm": 0.8635088801383972, "learning_rate": 2e-05, "loss": 0.6265, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8553, "tokens_per_second_per_gpu": 11355.34, "total_tokens": 844445209 }, { "epoch": 0.5347586896724181, "grad_norm": 0.959980845451355, "learning_rate": 2e-05, "loss": 0.645, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8554, "tokens_per_second_per_gpu": 10757.84, "total_tokens": 844545460 }, { "epoch": 0.5348212053013254, "grad_norm": 0.8825486898422241, "learning_rate": 2e-05, "loss": 0.6558, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8555, "tokens_per_second_per_gpu": 10547.5, "total_tokens": 844647748 }, { "epoch": 0.5348837209302325, "grad_norm": 0.8609839081764221, "learning_rate": 2e-05, "loss": 0.637, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8556, "tokens_per_second_per_gpu": 10841.05, "total_tokens": 844747463 }, { "epoch": 0.5349462365591398, "grad_norm": 0.944905698299408, "learning_rate": 2e-05, "loss": 0.698, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8557, "tokens_per_second_per_gpu": 10403.91, "total_tokens": 844846740 }, { "epoch": 0.535008752188047, "grad_norm": 0.875598132610321, "learning_rate": 2e-05, "loss": 0.6267, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8558, "tokens_per_second_per_gpu": 10965.26, "total_tokens": 844946127 }, { "epoch": 0.5350712678169542, "grad_norm": 0.8760350942611694, "learning_rate": 2e-05, "loss": 0.6331, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8559, "tokens_per_second_per_gpu": 11222.48, "total_tokens": 845047848 }, { "epoch": 0.5351337834458615, "grad_norm": 0.8902546167373657, "learning_rate": 2e-05, "loss": 0.6188, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8560, "tokens_per_second_per_gpu": 10565.08, "total_tokens": 845147063 }, { "epoch": 0.5351962990747687, "grad_norm": 0.9473203420639038, "learning_rate": 2e-05, "loss": 0.5978, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8561, "tokens_per_second_per_gpu": 12099.31, "total_tokens": 845238924 }, { "epoch": 0.5352588147036759, "grad_norm": 0.9056826233863831, "learning_rate": 2e-05, "loss": 0.6339, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8562, "tokens_per_second_per_gpu": 10604.51, "total_tokens": 845335676 }, { "epoch": 0.5353213303325831, "grad_norm": 0.8797902464866638, "learning_rate": 2e-05, "loss": 0.6315, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8563, "tokens_per_second_per_gpu": 10757.8, "total_tokens": 845435464 }, { "epoch": 0.5353838459614904, "grad_norm": 0.904388427734375, "learning_rate": 2e-05, "loss": 0.6625, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8564, "tokens_per_second_per_gpu": 10772.35, "total_tokens": 845535788 }, { "epoch": 0.5354463615903976, "grad_norm": 0.9101340770721436, "learning_rate": 2e-05, "loss": 0.6404, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8565, "tokens_per_second_per_gpu": 10871.09, "total_tokens": 845632389 }, { "epoch": 0.5355088772193048, "grad_norm": 0.8810776472091675, "learning_rate": 2e-05, "loss": 0.5669, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8566, "tokens_per_second_per_gpu": 10323.08, "total_tokens": 845721726 }, { "epoch": 0.5355713928482121, "grad_norm": 0.8489235639572144, "learning_rate": 2e-05, "loss": 0.6198, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8567, "tokens_per_second_per_gpu": 10580.28, "total_tokens": 845822998 }, { "epoch": 0.5356339084771192, "grad_norm": 0.8546453714370728, "learning_rate": 2e-05, "loss": 0.6363, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8568, "tokens_per_second_per_gpu": 10573.26, "total_tokens": 845923407 }, { "epoch": 0.5356964241060265, "grad_norm": 0.9002196192741394, "learning_rate": 2e-05, "loss": 0.6766, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8569, "tokens_per_second_per_gpu": 10722.24, "total_tokens": 846025817 }, { "epoch": 0.5357589397349337, "grad_norm": 0.8949682116508484, "learning_rate": 2e-05, "loss": 0.6361, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8570, "tokens_per_second_per_gpu": 10279.65, "total_tokens": 846125710 }, { "epoch": 0.535821455363841, "grad_norm": 0.8721354007720947, "learning_rate": 2e-05, "loss": 0.6552, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8571, "tokens_per_second_per_gpu": 11189.83, "total_tokens": 846225525 }, { "epoch": 0.5358839709927482, "grad_norm": 0.8443442583084106, "learning_rate": 2e-05, "loss": 0.653, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8572, "tokens_per_second_per_gpu": 10957.78, "total_tokens": 846327090 }, { "epoch": 0.5359464866216554, "grad_norm": 1.029628038406372, "learning_rate": 2e-05, "loss": 0.6413, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8573, "tokens_per_second_per_gpu": 10181.31, "total_tokens": 846423228 }, { "epoch": 0.5360090022505627, "grad_norm": 0.9020698070526123, "learning_rate": 2e-05, "loss": 0.5971, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8574, "tokens_per_second_per_gpu": 10073.94, "total_tokens": 846517531 }, { "epoch": 0.5360715178794698, "grad_norm": 0.8901268243789673, "learning_rate": 2e-05, "loss": 0.6111, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8575, "tokens_per_second_per_gpu": 10617.36, "total_tokens": 846617205 }, { "epoch": 0.5361340335083771, "grad_norm": 0.8968618512153625, "learning_rate": 2e-05, "loss": 0.5812, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8576, "tokens_per_second_per_gpu": 9450.7, "total_tokens": 846710308 }, { "epoch": 0.5361965491372843, "grad_norm": 0.8930585384368896, "learning_rate": 2e-05, "loss": 0.6215, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8577, "tokens_per_second_per_gpu": 10736.5, "total_tokens": 846810581 }, { "epoch": 0.5362590647661916, "grad_norm": 0.9148156046867371, "learning_rate": 2e-05, "loss": 0.6114, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8578, "tokens_per_second_per_gpu": 10036.3, "total_tokens": 846902082 }, { "epoch": 0.5363215803950988, "grad_norm": 0.8827527165412903, "learning_rate": 2e-05, "loss": 0.6025, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8579, "tokens_per_second_per_gpu": 10457.15, "total_tokens": 846999452 }, { "epoch": 0.536384096024006, "grad_norm": 0.9005237817764282, "learning_rate": 2e-05, "loss": 0.6542, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8580, "tokens_per_second_per_gpu": 10895.25, "total_tokens": 847099074 }, { "epoch": 0.5364466116529132, "grad_norm": 0.9123246669769287, "learning_rate": 2e-05, "loss": 0.6476, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8581, "tokens_per_second_per_gpu": 10719.74, "total_tokens": 847198275 }, { "epoch": 0.5365091272818204, "grad_norm": 0.8875145316123962, "learning_rate": 2e-05, "loss": 0.6503, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8582, "tokens_per_second_per_gpu": 10299.8, "total_tokens": 847296586 }, { "epoch": 0.5365716429107277, "grad_norm": 0.9305289387702942, "learning_rate": 2e-05, "loss": 0.6522, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8583, "tokens_per_second_per_gpu": 11075.03, "total_tokens": 847396696 }, { "epoch": 0.5366341585396349, "grad_norm": 0.8323707580566406, "learning_rate": 2e-05, "loss": 0.6154, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8584, "tokens_per_second_per_gpu": 10820.3, "total_tokens": 847496779 }, { "epoch": 0.5366966741685422, "grad_norm": 0.8819716572761536, "learning_rate": 2e-05, "loss": 0.5999, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8585, "tokens_per_second_per_gpu": 11227.27, "total_tokens": 847599606 }, { "epoch": 0.5367591897974494, "grad_norm": 0.9135024547576904, "learning_rate": 2e-05, "loss": 0.6585, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8586, "tokens_per_second_per_gpu": 10292.27, "total_tokens": 847697382 }, { "epoch": 0.5368217054263565, "grad_norm": 0.9124364256858826, "learning_rate": 2e-05, "loss": 0.6566, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8587, "tokens_per_second_per_gpu": 10660.34, "total_tokens": 847798423 }, { "epoch": 0.5368842210552638, "grad_norm": 0.9340257048606873, "learning_rate": 2e-05, "loss": 0.652, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8588, "tokens_per_second_per_gpu": 9521.4, "total_tokens": 847891029 }, { "epoch": 0.536946736684171, "grad_norm": 0.9072908759117126, "learning_rate": 2e-05, "loss": 0.6585, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8589, "tokens_per_second_per_gpu": 10455.41, "total_tokens": 847988674 }, { "epoch": 0.5370092523130783, "grad_norm": 0.8869579434394836, "learning_rate": 2e-05, "loss": 0.613, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8590, "tokens_per_second_per_gpu": 10529.46, "total_tokens": 848087324 }, { "epoch": 0.5370717679419855, "grad_norm": 0.9024474024772644, "learning_rate": 2e-05, "loss": 0.6625, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8591, "tokens_per_second_per_gpu": 10335.76, "total_tokens": 848184862 }, { "epoch": 0.5371342835708928, "grad_norm": 0.895251989364624, "learning_rate": 2e-05, "loss": 0.6718, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8592, "tokens_per_second_per_gpu": 10647.08, "total_tokens": 848285460 }, { "epoch": 0.5371967991997999, "grad_norm": 0.9068478345870972, "learning_rate": 2e-05, "loss": 0.6521, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8593, "tokens_per_second_per_gpu": 10022.43, "total_tokens": 848386710 }, { "epoch": 0.5372593148287071, "grad_norm": 0.9324324727058411, "learning_rate": 2e-05, "loss": 0.6523, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8594, "tokens_per_second_per_gpu": 11078.34, "total_tokens": 848489500 }, { "epoch": 0.5373218304576144, "grad_norm": 0.8738628625869751, "learning_rate": 2e-05, "loss": 0.6829, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8595, "tokens_per_second_per_gpu": 11424.42, "total_tokens": 848597242 }, { "epoch": 0.5373843460865216, "grad_norm": 0.8957762122154236, "learning_rate": 2e-05, "loss": 0.6159, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8596, "tokens_per_second_per_gpu": 9928.94, "total_tokens": 848693247 }, { "epoch": 0.5374468617154289, "grad_norm": 0.9043852686882019, "learning_rate": 2e-05, "loss": 0.6342, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8597, "tokens_per_second_per_gpu": 10232.28, "total_tokens": 848790710 }, { "epoch": 0.5375093773443361, "grad_norm": 0.9330704212188721, "learning_rate": 2e-05, "loss": 0.6742, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8598, "tokens_per_second_per_gpu": 10502.55, "total_tokens": 848890242 }, { "epoch": 0.5375718929732433, "grad_norm": 0.9475247859954834, "learning_rate": 2e-05, "loss": 0.6447, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8599, "tokens_per_second_per_gpu": 9954.27, "total_tokens": 848986450 }, { "epoch": 0.5376344086021505, "grad_norm": 0.8892850875854492, "learning_rate": 2e-05, "loss": 0.6233, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8600, "tokens_per_second_per_gpu": 10853.6, "total_tokens": 849088196 }, { "epoch": 0.5376969242310577, "grad_norm": 0.8806895017623901, "learning_rate": 2e-05, "loss": 0.6324, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8601, "tokens_per_second_per_gpu": 9600.76, "total_tokens": 849185421 }, { "epoch": 0.537759439859965, "grad_norm": 0.8885801434516907, "learning_rate": 2e-05, "loss": 0.6553, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8602, "tokens_per_second_per_gpu": 10416.11, "total_tokens": 849284232 }, { "epoch": 0.5378219554888722, "grad_norm": 0.9012125730514526, "learning_rate": 2e-05, "loss": 0.6434, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8603, "tokens_per_second_per_gpu": 11112.77, "total_tokens": 849386935 }, { "epoch": 0.5378844711177795, "grad_norm": 0.9023561477661133, "learning_rate": 2e-05, "loss": 0.6461, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8604, "tokens_per_second_per_gpu": 9642.57, "total_tokens": 849485878 }, { "epoch": 0.5379469867466866, "grad_norm": 0.9788317680358887, "learning_rate": 2e-05, "loss": 0.66, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8605, "tokens_per_second_per_gpu": 10755.07, "total_tokens": 849588429 }, { "epoch": 0.5380095023755939, "grad_norm": 0.8689116835594177, "learning_rate": 2e-05, "loss": 0.616, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8606, "tokens_per_second_per_gpu": 10578.98, "total_tokens": 849686314 }, { "epoch": 0.5380720180045011, "grad_norm": 0.8996153473854065, "learning_rate": 2e-05, "loss": 0.6493, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8607, "tokens_per_second_per_gpu": 10066.85, "total_tokens": 849782530 }, { "epoch": 0.5381345336334084, "grad_norm": 0.8963978290557861, "learning_rate": 2e-05, "loss": 0.626, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8608, "tokens_per_second_per_gpu": 10352.58, "total_tokens": 849881801 }, { "epoch": 0.5381970492623156, "grad_norm": 0.8830629587173462, "learning_rate": 2e-05, "loss": 0.6157, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8609, "tokens_per_second_per_gpu": 10316.52, "total_tokens": 849977528 }, { "epoch": 0.5382595648912228, "grad_norm": 0.8763728737831116, "learning_rate": 2e-05, "loss": 0.6209, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8610, "tokens_per_second_per_gpu": 10925.58, "total_tokens": 850075483 }, { "epoch": 0.5383220805201301, "grad_norm": 0.8738952875137329, "learning_rate": 2e-05, "loss": 0.6184, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8611, "tokens_per_second_per_gpu": 10668.9, "total_tokens": 850173994 }, { "epoch": 0.5383845961490372, "grad_norm": 0.8707404136657715, "learning_rate": 2e-05, "loss": 0.6148, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8612, "tokens_per_second_per_gpu": 10016.73, "total_tokens": 850269545 }, { "epoch": 0.5384471117779445, "grad_norm": 0.9153876900672913, "learning_rate": 2e-05, "loss": 0.6592, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8613, "tokens_per_second_per_gpu": 10322.7, "total_tokens": 850368280 }, { "epoch": 0.5385096274068517, "grad_norm": 0.8749470710754395, "learning_rate": 2e-05, "loss": 0.6502, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8614, "tokens_per_second_per_gpu": 10503.21, "total_tokens": 850468789 }, { "epoch": 0.538572143035759, "grad_norm": 0.9029932618141174, "learning_rate": 2e-05, "loss": 0.6705, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8615, "tokens_per_second_per_gpu": 11078.42, "total_tokens": 850571706 }, { "epoch": 0.5386346586646662, "grad_norm": 0.8761382699012756, "learning_rate": 2e-05, "loss": 0.6524, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8616, "tokens_per_second_per_gpu": 10383.1, "total_tokens": 850674274 }, { "epoch": 0.5386971742935734, "grad_norm": 0.8729599714279175, "learning_rate": 2e-05, "loss": 0.5995, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8617, "tokens_per_second_per_gpu": 10178.65, "total_tokens": 850771974 }, { "epoch": 0.5387596899224806, "grad_norm": 0.9283816814422607, "learning_rate": 2e-05, "loss": 0.6314, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8618, "tokens_per_second_per_gpu": 9583.78, "total_tokens": 850867279 }, { "epoch": 0.5388222055513878, "grad_norm": 0.8943790197372437, "learning_rate": 2e-05, "loss": 0.6346, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8619, "tokens_per_second_per_gpu": 10291.62, "total_tokens": 850966141 }, { "epoch": 0.5388847211802951, "grad_norm": 0.8821471333503723, "learning_rate": 2e-05, "loss": 0.6547, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8620, "tokens_per_second_per_gpu": 10384.31, "total_tokens": 851066636 }, { "epoch": 0.5389472368092023, "grad_norm": 0.8852878212928772, "learning_rate": 2e-05, "loss": 0.63, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8621, "tokens_per_second_per_gpu": 10685.04, "total_tokens": 851166937 }, { "epoch": 0.5390097524381096, "grad_norm": 0.8995353579521179, "learning_rate": 2e-05, "loss": 0.6161, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8622, "tokens_per_second_per_gpu": 9521.45, "total_tokens": 851264283 }, { "epoch": 0.5390722680670168, "grad_norm": 0.8857541084289551, "learning_rate": 2e-05, "loss": 0.6445, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8623, "tokens_per_second_per_gpu": 10602.9, "total_tokens": 851366060 }, { "epoch": 0.5391347836959239, "grad_norm": 0.8868095874786377, "learning_rate": 2e-05, "loss": 0.6333, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8624, "tokens_per_second_per_gpu": 11294.75, "total_tokens": 851467417 }, { "epoch": 0.5391972993248312, "grad_norm": 0.8984832167625427, "learning_rate": 2e-05, "loss": 0.6243, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8625, "tokens_per_second_per_gpu": 10420.82, "total_tokens": 851562257 }, { "epoch": 0.5392598149537384, "grad_norm": 0.8854532241821289, "learning_rate": 2e-05, "loss": 0.6605, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8626, "tokens_per_second_per_gpu": 10398.17, "total_tokens": 851661471 }, { "epoch": 0.5393223305826457, "grad_norm": 0.8685122132301331, "learning_rate": 2e-05, "loss": 0.6665, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8627, "tokens_per_second_per_gpu": 10383.62, "total_tokens": 851762251 }, { "epoch": 0.5393848462115529, "grad_norm": 0.9022653102874756, "learning_rate": 2e-05, "loss": 0.6521, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8628, "tokens_per_second_per_gpu": 10159.2, "total_tokens": 851860231 }, { "epoch": 0.5394473618404602, "grad_norm": 0.9315930604934692, "learning_rate": 2e-05, "loss": 0.6384, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8629, "tokens_per_second_per_gpu": 10624.3, "total_tokens": 851957291 }, { "epoch": 0.5395098774693673, "grad_norm": 0.8627650737762451, "learning_rate": 2e-05, "loss": 0.6304, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8630, "tokens_per_second_per_gpu": 10731.33, "total_tokens": 852059568 }, { "epoch": 0.5395723930982745, "grad_norm": 0.8846077919006348, "learning_rate": 2e-05, "loss": 0.594, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8631, "tokens_per_second_per_gpu": 9647.77, "total_tokens": 852152149 }, { "epoch": 0.5396349087271818, "grad_norm": 0.9317128658294678, "learning_rate": 2e-05, "loss": 0.612, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8632, "tokens_per_second_per_gpu": 10540.02, "total_tokens": 852248554 }, { "epoch": 0.539697424356089, "grad_norm": 0.898109495639801, "learning_rate": 2e-05, "loss": 0.6697, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8633, "tokens_per_second_per_gpu": 10615.11, "total_tokens": 852351327 }, { "epoch": 0.5397599399849963, "grad_norm": 0.9358903765678406, "learning_rate": 2e-05, "loss": 0.6512, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8634, "tokens_per_second_per_gpu": 10239.42, "total_tokens": 852448274 }, { "epoch": 0.5398224556139035, "grad_norm": 0.8811209201812744, "learning_rate": 2e-05, "loss": 0.6366, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8635, "tokens_per_second_per_gpu": 10771.62, "total_tokens": 852548270 }, { "epoch": 0.5398849712428107, "grad_norm": 0.9015743136405945, "learning_rate": 2e-05, "loss": 0.6261, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8636, "tokens_per_second_per_gpu": 10452.31, "total_tokens": 852647601 }, { "epoch": 0.5399474868717179, "grad_norm": 0.861785888671875, "learning_rate": 2e-05, "loss": 0.6301, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8637, "tokens_per_second_per_gpu": 10667.05, "total_tokens": 852747517 }, { "epoch": 0.5400100025006251, "grad_norm": 0.8946176171302795, "learning_rate": 2e-05, "loss": 0.6404, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8638, "tokens_per_second_per_gpu": 10368.49, "total_tokens": 852845591 }, { "epoch": 0.5400725181295324, "grad_norm": 0.9840120077133179, "learning_rate": 2e-05, "loss": 0.6151, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8639, "tokens_per_second_per_gpu": 10586.08, "total_tokens": 852936525 }, { "epoch": 0.5401350337584396, "grad_norm": 0.8775882720947266, "learning_rate": 2e-05, "loss": 0.6416, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8640, "tokens_per_second_per_gpu": 10599.54, "total_tokens": 853038137 }, { "epoch": 0.5401975493873469, "grad_norm": 0.8792517185211182, "learning_rate": 2e-05, "loss": 0.6235, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8641, "tokens_per_second_per_gpu": 10547.13, "total_tokens": 853139312 }, { "epoch": 0.540260065016254, "grad_norm": 0.8829768300056458, "learning_rate": 2e-05, "loss": 0.6162, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8642, "tokens_per_second_per_gpu": 11034.46, "total_tokens": 853239221 }, { "epoch": 0.5403225806451613, "grad_norm": 0.8825163841247559, "learning_rate": 2e-05, "loss": 0.6483, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8643, "tokens_per_second_per_gpu": 10550.15, "total_tokens": 853335998 }, { "epoch": 0.5403850962740685, "grad_norm": 0.8667542934417725, "learning_rate": 2e-05, "loss": 0.6695, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8644, "tokens_per_second_per_gpu": 11095.34, "total_tokens": 853440875 }, { "epoch": 0.5404476119029757, "grad_norm": 0.8976192474365234, "learning_rate": 2e-05, "loss": 0.6369, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8645, "tokens_per_second_per_gpu": 10686.97, "total_tokens": 853537701 }, { "epoch": 0.540510127531883, "grad_norm": 0.9191345572471619, "learning_rate": 2e-05, "loss": 0.6213, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8646, "tokens_per_second_per_gpu": 10088.63, "total_tokens": 853633011 }, { "epoch": 0.5405726431607902, "grad_norm": 0.8876688480377197, "learning_rate": 2e-05, "loss": 0.6211, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8647, "tokens_per_second_per_gpu": 10725.48, "total_tokens": 853731594 }, { "epoch": 0.5406351587896975, "grad_norm": 0.9025980234146118, "learning_rate": 2e-05, "loss": 0.6621, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8648, "tokens_per_second_per_gpu": 10215.58, "total_tokens": 853830204 }, { "epoch": 0.5406976744186046, "grad_norm": 0.8954533934593201, "learning_rate": 2e-05, "loss": 0.6662, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8649, "tokens_per_second_per_gpu": 10594.68, "total_tokens": 853930110 }, { "epoch": 0.5407601900475119, "grad_norm": 0.8603057861328125, "learning_rate": 2e-05, "loss": 0.6408, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8650, "tokens_per_second_per_gpu": 10802.13, "total_tokens": 854032797 }, { "epoch": 0.5408227056764191, "grad_norm": 0.9077618718147278, "learning_rate": 2e-05, "loss": 0.6129, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8651, "tokens_per_second_per_gpu": 10616.3, "total_tokens": 854129635 }, { "epoch": 0.5408852213053263, "grad_norm": 0.9263349175453186, "learning_rate": 2e-05, "loss": 0.682, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8652, "tokens_per_second_per_gpu": 10799.1, "total_tokens": 854227097 }, { "epoch": 0.5409477369342336, "grad_norm": 0.9308435320854187, "learning_rate": 2e-05, "loss": 0.6111, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8653, "tokens_per_second_per_gpu": 9748.78, "total_tokens": 854321481 }, { "epoch": 0.5410102525631408, "grad_norm": 0.9166691303253174, "learning_rate": 2e-05, "loss": 0.6069, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8654, "tokens_per_second_per_gpu": 10593.59, "total_tokens": 854413488 }, { "epoch": 0.541072768192048, "grad_norm": 0.8808215260505676, "learning_rate": 2e-05, "loss": 0.618, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8655, "tokens_per_second_per_gpu": 11285.14, "total_tokens": 854515270 }, { "epoch": 0.5411352838209552, "grad_norm": 0.8901917338371277, "learning_rate": 2e-05, "loss": 0.6379, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8656, "tokens_per_second_per_gpu": 11094.41, "total_tokens": 854617285 }, { "epoch": 0.5411977994498625, "grad_norm": 0.9110189080238342, "learning_rate": 2e-05, "loss": 0.6509, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8657, "tokens_per_second_per_gpu": 10243.39, "total_tokens": 854716484 }, { "epoch": 0.5412603150787697, "grad_norm": 0.8392737507820129, "learning_rate": 2e-05, "loss": 0.6433, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8658, "tokens_per_second_per_gpu": 10408.29, "total_tokens": 854820417 }, { "epoch": 0.541322830707677, "grad_norm": 0.8499301075935364, "learning_rate": 2e-05, "loss": 0.64, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8659, "tokens_per_second_per_gpu": 11274.46, "total_tokens": 854924525 }, { "epoch": 0.5413853463365842, "grad_norm": 0.9152110815048218, "learning_rate": 2e-05, "loss": 0.6223, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8660, "tokens_per_second_per_gpu": 10447.27, "total_tokens": 855018569 }, { "epoch": 0.5414478619654913, "grad_norm": 0.8490122556686401, "learning_rate": 2e-05, "loss": 0.643, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8661, "tokens_per_second_per_gpu": 11019.0, "total_tokens": 855117935 }, { "epoch": 0.5415103775943986, "grad_norm": 0.9136269688606262, "learning_rate": 2e-05, "loss": 0.6306, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8662, "tokens_per_second_per_gpu": 9978.98, "total_tokens": 855210977 }, { "epoch": 0.5415728932233058, "grad_norm": 0.9135362505912781, "learning_rate": 2e-05, "loss": 0.6424, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8663, "tokens_per_second_per_gpu": 11680.98, "total_tokens": 855308344 }, { "epoch": 0.5416354088522131, "grad_norm": 0.9174232482910156, "learning_rate": 2e-05, "loss": 0.6531, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8664, "tokens_per_second_per_gpu": 10770.71, "total_tokens": 855407507 }, { "epoch": 0.5416979244811203, "grad_norm": 0.8972058296203613, "learning_rate": 2e-05, "loss": 0.6833, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8665, "tokens_per_second_per_gpu": 10997.48, "total_tokens": 855506791 }, { "epoch": 0.5417604401100276, "grad_norm": 0.868038535118103, "learning_rate": 2e-05, "loss": 0.6426, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8666, "tokens_per_second_per_gpu": 10360.75, "total_tokens": 855606123 }, { "epoch": 0.5418229557389347, "grad_norm": 0.9332579374313354, "learning_rate": 2e-05, "loss": 0.6233, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8667, "tokens_per_second_per_gpu": 10179.55, "total_tokens": 855700895 }, { "epoch": 0.5418854713678419, "grad_norm": 0.875525712966919, "learning_rate": 2e-05, "loss": 0.6769, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8668, "tokens_per_second_per_gpu": 11306.82, "total_tokens": 855805850 }, { "epoch": 0.5419479869967492, "grad_norm": 0.9170960187911987, "learning_rate": 2e-05, "loss": 0.6902, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8669, "tokens_per_second_per_gpu": 10616.16, "total_tokens": 855907371 }, { "epoch": 0.5420105026256564, "grad_norm": 0.9004307985305786, "learning_rate": 2e-05, "loss": 0.6959, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8670, "tokens_per_second_per_gpu": 10539.95, "total_tokens": 856009096 }, { "epoch": 0.5420730182545637, "grad_norm": 0.8932983875274658, "learning_rate": 2e-05, "loss": 0.6213, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8671, "tokens_per_second_per_gpu": 10563.78, "total_tokens": 856107985 }, { "epoch": 0.5421355338834709, "grad_norm": 0.8905524611473083, "learning_rate": 2e-05, "loss": 0.6244, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8672, "tokens_per_second_per_gpu": 10978.43, "total_tokens": 856206360 }, { "epoch": 0.542198049512378, "grad_norm": 0.8759914040565491, "learning_rate": 2e-05, "loss": 0.5855, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8673, "tokens_per_second_per_gpu": 10199.54, "total_tokens": 856304742 }, { "epoch": 0.5422605651412853, "grad_norm": 0.893202543258667, "learning_rate": 2e-05, "loss": 0.6086, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8674, "tokens_per_second_per_gpu": 10599.64, "total_tokens": 856402034 }, { "epoch": 0.5423230807701925, "grad_norm": 0.9382790327072144, "learning_rate": 2e-05, "loss": 0.631, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8675, "tokens_per_second_per_gpu": 9770.52, "total_tokens": 856495872 }, { "epoch": 0.5423855963990998, "grad_norm": 0.85859614610672, "learning_rate": 2e-05, "loss": 0.5769, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8676, "tokens_per_second_per_gpu": 9453.7, "total_tokens": 856589396 }, { "epoch": 0.542448112028007, "grad_norm": 0.8748466372489929, "learning_rate": 2e-05, "loss": 0.6571, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8677, "tokens_per_second_per_gpu": 10730.69, "total_tokens": 856690127 }, { "epoch": 0.5425106276569143, "grad_norm": 0.925590991973877, "learning_rate": 2e-05, "loss": 0.6274, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8678, "tokens_per_second_per_gpu": 10675.61, "total_tokens": 856784444 }, { "epoch": 0.5425731432858214, "grad_norm": 0.9067526459693909, "learning_rate": 2e-05, "loss": 0.6342, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8679, "tokens_per_second_per_gpu": 10195.81, "total_tokens": 856880122 }, { "epoch": 0.5426356589147286, "grad_norm": 1.0545755624771118, "learning_rate": 2e-05, "loss": 0.6303, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8680, "tokens_per_second_per_gpu": 9754.09, "total_tokens": 856972790 }, { "epoch": 0.5426981745436359, "grad_norm": 0.8832362294197083, "learning_rate": 2e-05, "loss": 0.6485, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8681, "tokens_per_second_per_gpu": 10722.11, "total_tokens": 857072751 }, { "epoch": 0.5427606901725431, "grad_norm": 0.8847081065177917, "learning_rate": 2e-05, "loss": 0.6715, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8682, "tokens_per_second_per_gpu": 10237.24, "total_tokens": 857174454 }, { "epoch": 0.5428232058014504, "grad_norm": 0.9519569277763367, "learning_rate": 2e-05, "loss": 0.6498, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8683, "tokens_per_second_per_gpu": 10454.24, "total_tokens": 857276270 }, { "epoch": 0.5428857214303576, "grad_norm": 0.8868563771247864, "learning_rate": 2e-05, "loss": 0.6398, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8684, "tokens_per_second_per_gpu": 10606.29, "total_tokens": 857374834 }, { "epoch": 0.5429482370592649, "grad_norm": 0.928360641002655, "learning_rate": 2e-05, "loss": 0.6062, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8685, "tokens_per_second_per_gpu": 10419.67, "total_tokens": 857470787 }, { "epoch": 0.543010752688172, "grad_norm": 0.9273526072502136, "learning_rate": 2e-05, "loss": 0.6183, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8686, "tokens_per_second_per_gpu": 10442.31, "total_tokens": 857569984 }, { "epoch": 0.5430732683170792, "grad_norm": 0.9161180257797241, "learning_rate": 2e-05, "loss": 0.6292, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8687, "tokens_per_second_per_gpu": 9816.14, "total_tokens": 857665500 }, { "epoch": 0.5431357839459865, "grad_norm": 0.9156473278999329, "learning_rate": 2e-05, "loss": 0.628, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8688, "tokens_per_second_per_gpu": 9829.49, "total_tokens": 857759203 }, { "epoch": 0.5431982995748937, "grad_norm": 0.8816787600517273, "learning_rate": 2e-05, "loss": 0.6179, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8689, "tokens_per_second_per_gpu": 10905.43, "total_tokens": 857857359 }, { "epoch": 0.543260815203801, "grad_norm": 0.8852891325950623, "learning_rate": 2e-05, "loss": 0.6171, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8690, "tokens_per_second_per_gpu": 10983.16, "total_tokens": 857957100 }, { "epoch": 0.5433233308327082, "grad_norm": 0.9279139041900635, "learning_rate": 2e-05, "loss": 0.6242, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8691, "tokens_per_second_per_gpu": 11163.59, "total_tokens": 858058214 }, { "epoch": 0.5433858464616154, "grad_norm": 0.8981950283050537, "learning_rate": 2e-05, "loss": 0.6254, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8692, "tokens_per_second_per_gpu": 10482.19, "total_tokens": 858157002 }, { "epoch": 0.5434483620905226, "grad_norm": 0.8892004489898682, "learning_rate": 2e-05, "loss": 0.6542, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8693, "tokens_per_second_per_gpu": 11247.69, "total_tokens": 858260979 }, { "epoch": 0.5435108777194299, "grad_norm": 0.9479150772094727, "learning_rate": 2e-05, "loss": 0.6209, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8694, "tokens_per_second_per_gpu": 10262.43, "total_tokens": 858356425 }, { "epoch": 0.5435733933483371, "grad_norm": 0.8597239255905151, "learning_rate": 2e-05, "loss": 0.6134, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8695, "tokens_per_second_per_gpu": 10319.7, "total_tokens": 858455954 }, { "epoch": 0.5436359089772443, "grad_norm": 0.832676351070404, "learning_rate": 2e-05, "loss": 0.5869, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8696, "tokens_per_second_per_gpu": 10558.82, "total_tokens": 858558818 }, { "epoch": 0.5436984246061516, "grad_norm": 0.8995190858840942, "learning_rate": 2e-05, "loss": 0.6485, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8697, "tokens_per_second_per_gpu": 10876.0, "total_tokens": 858660987 }, { "epoch": 0.5437609402350587, "grad_norm": 0.8695652484893799, "learning_rate": 2e-05, "loss": 0.6454, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8698, "tokens_per_second_per_gpu": 10953.45, "total_tokens": 858761369 }, { "epoch": 0.543823455863966, "grad_norm": 0.8940812349319458, "learning_rate": 2e-05, "loss": 0.6209, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8699, "tokens_per_second_per_gpu": 10513.32, "total_tokens": 858857973 }, { "epoch": 0.5438859714928732, "grad_norm": 0.9231239557266235, "learning_rate": 2e-05, "loss": 0.6569, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8700, "tokens_per_second_per_gpu": 10222.24, "total_tokens": 858955572 }, { "epoch": 0.5439484871217805, "grad_norm": 0.8876085877418518, "learning_rate": 2e-05, "loss": 0.6536, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8701, "tokens_per_second_per_gpu": 10159.57, "total_tokens": 859057971 }, { "epoch": 0.5440110027506877, "grad_norm": 0.9590288400650024, "learning_rate": 2e-05, "loss": 0.6623, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8702, "tokens_per_second_per_gpu": 9735.6, "total_tokens": 859156007 }, { "epoch": 0.544073518379595, "grad_norm": 0.9005311131477356, "learning_rate": 2e-05, "loss": 0.6646, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8703, "tokens_per_second_per_gpu": 10523.28, "total_tokens": 859251083 }, { "epoch": 0.5441360340085021, "grad_norm": 0.8757197260856628, "learning_rate": 2e-05, "loss": 0.5944, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8704, "tokens_per_second_per_gpu": 10533.3, "total_tokens": 859346464 }, { "epoch": 0.5441985496374093, "grad_norm": 0.885715901851654, "learning_rate": 2e-05, "loss": 0.6001, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8705, "tokens_per_second_per_gpu": 9898.21, "total_tokens": 859443256 }, { "epoch": 0.5442610652663166, "grad_norm": 0.8884603977203369, "learning_rate": 2e-05, "loss": 0.6187, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8706, "tokens_per_second_per_gpu": 10846.62, "total_tokens": 859539965 }, { "epoch": 0.5443235808952238, "grad_norm": 0.8731570243835449, "learning_rate": 2e-05, "loss": 0.6619, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8707, "tokens_per_second_per_gpu": 10421.59, "total_tokens": 859639991 }, { "epoch": 0.5443860965241311, "grad_norm": 0.8912026286125183, "learning_rate": 2e-05, "loss": 0.6097, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8708, "tokens_per_second_per_gpu": 10497.47, "total_tokens": 859736530 }, { "epoch": 0.5444486121530383, "grad_norm": 0.9198052287101746, "learning_rate": 2e-05, "loss": 0.6693, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8709, "tokens_per_second_per_gpu": 10892.94, "total_tokens": 859835622 }, { "epoch": 0.5445111277819454, "grad_norm": 0.881779134273529, "learning_rate": 2e-05, "loss": 0.6561, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8710, "tokens_per_second_per_gpu": 10242.99, "total_tokens": 859932859 }, { "epoch": 0.5445736434108527, "grad_norm": 0.856915295124054, "learning_rate": 2e-05, "loss": 0.6339, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8711, "tokens_per_second_per_gpu": 10371.05, "total_tokens": 860029738 }, { "epoch": 0.5446361590397599, "grad_norm": 0.9108766913414001, "learning_rate": 2e-05, "loss": 0.6505, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8712, "tokens_per_second_per_gpu": 9770.15, "total_tokens": 860120773 }, { "epoch": 0.5446986746686672, "grad_norm": 0.9070941209793091, "learning_rate": 2e-05, "loss": 0.6191, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8713, "tokens_per_second_per_gpu": 10832.1, "total_tokens": 860221341 }, { "epoch": 0.5447611902975744, "grad_norm": 0.8980939984321594, "learning_rate": 2e-05, "loss": 0.6196, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8714, "tokens_per_second_per_gpu": 10113.04, "total_tokens": 860315153 }, { "epoch": 0.5448237059264817, "grad_norm": 0.9008875489234924, "learning_rate": 2e-05, "loss": 0.6654, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8715, "tokens_per_second_per_gpu": 10927.2, "total_tokens": 860413563 }, { "epoch": 0.5448862215553888, "grad_norm": 0.8982216119766235, "learning_rate": 2e-05, "loss": 0.6543, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8716, "tokens_per_second_per_gpu": 10866.67, "total_tokens": 860515440 }, { "epoch": 0.544948737184296, "grad_norm": 0.9265142679214478, "learning_rate": 2e-05, "loss": 0.6952, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8717, "tokens_per_second_per_gpu": 10569.05, "total_tokens": 860614395 }, { "epoch": 0.5450112528132033, "grad_norm": 0.8833805918693542, "learning_rate": 2e-05, "loss": 0.6052, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8718, "tokens_per_second_per_gpu": 9618.94, "total_tokens": 860708062 }, { "epoch": 0.5450737684421105, "grad_norm": 0.8971924185752869, "learning_rate": 2e-05, "loss": 0.5975, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8719, "tokens_per_second_per_gpu": 9937.24, "total_tokens": 860805903 }, { "epoch": 0.5451362840710178, "grad_norm": 0.9068397283554077, "learning_rate": 2e-05, "loss": 0.6239, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8720, "tokens_per_second_per_gpu": 9808.2, "total_tokens": 860902700 }, { "epoch": 0.545198799699925, "grad_norm": 0.9057365655899048, "learning_rate": 2e-05, "loss": 0.6146, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8721, "tokens_per_second_per_gpu": 9773.3, "total_tokens": 860999744 }, { "epoch": 0.5452613153288323, "grad_norm": 0.9008947610855103, "learning_rate": 2e-05, "loss": 0.6628, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8722, "tokens_per_second_per_gpu": 10421.37, "total_tokens": 861095230 }, { "epoch": 0.5453238309577394, "grad_norm": 0.8816707730293274, "learning_rate": 2e-05, "loss": 0.6651, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8723, "tokens_per_second_per_gpu": 11045.04, "total_tokens": 861197006 }, { "epoch": 0.5453863465866466, "grad_norm": 0.8782027363777161, "learning_rate": 2e-05, "loss": 0.6128, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8724, "tokens_per_second_per_gpu": 10729.63, "total_tokens": 861295472 }, { "epoch": 0.5454488622155539, "grad_norm": 0.9160271286964417, "learning_rate": 2e-05, "loss": 0.6364, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8725, "tokens_per_second_per_gpu": 11029.55, "total_tokens": 861394435 }, { "epoch": 0.5455113778444611, "grad_norm": 0.8562865257263184, "learning_rate": 2e-05, "loss": 0.5732, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8726, "tokens_per_second_per_gpu": 10457.07, "total_tokens": 861490574 }, { "epoch": 0.5455738934733684, "grad_norm": 0.9007021188735962, "learning_rate": 2e-05, "loss": 0.6578, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8727, "tokens_per_second_per_gpu": 10741.25, "total_tokens": 861592526 }, { "epoch": 0.5456364091022756, "grad_norm": 0.9016844034194946, "learning_rate": 2e-05, "loss": 0.6468, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8728, "tokens_per_second_per_gpu": 10603.89, "total_tokens": 861691191 }, { "epoch": 0.5456989247311828, "grad_norm": 0.9004611372947693, "learning_rate": 2e-05, "loss": 0.6231, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8729, "tokens_per_second_per_gpu": 10776.3, "total_tokens": 861786370 }, { "epoch": 0.54576144036009, "grad_norm": 0.9101533889770508, "learning_rate": 2e-05, "loss": 0.6569, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8730, "tokens_per_second_per_gpu": 11020.86, "total_tokens": 861887078 }, { "epoch": 0.5458239559889972, "grad_norm": 0.8558598160743713, "learning_rate": 2e-05, "loss": 0.6175, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8731, "tokens_per_second_per_gpu": 10054.99, "total_tokens": 861986772 }, { "epoch": 0.5458864716179045, "grad_norm": 0.9278980493545532, "learning_rate": 2e-05, "loss": 0.6926, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8732, "tokens_per_second_per_gpu": 10659.28, "total_tokens": 862086783 }, { "epoch": 0.5459489872468117, "grad_norm": 0.8812701106071472, "learning_rate": 2e-05, "loss": 0.6251, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8733, "tokens_per_second_per_gpu": 10853.51, "total_tokens": 862187240 }, { "epoch": 0.546011502875719, "grad_norm": 0.9180045127868652, "learning_rate": 2e-05, "loss": 0.6217, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8734, "tokens_per_second_per_gpu": 9347.34, "total_tokens": 862284327 }, { "epoch": 0.5460740185046261, "grad_norm": 0.8734992146492004, "learning_rate": 2e-05, "loss": 0.6451, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8735, "tokens_per_second_per_gpu": 10722.73, "total_tokens": 862384514 }, { "epoch": 0.5461365341335334, "grad_norm": 0.9095596671104431, "learning_rate": 2e-05, "loss": 0.6296, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8736, "tokens_per_second_per_gpu": 10271.13, "total_tokens": 862480952 }, { "epoch": 0.5461990497624406, "grad_norm": 0.9049538969993591, "learning_rate": 2e-05, "loss": 0.649, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8737, "tokens_per_second_per_gpu": 10089.71, "total_tokens": 862575421 }, { "epoch": 0.5462615653913478, "grad_norm": 0.8521301746368408, "learning_rate": 2e-05, "loss": 0.6449, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8738, "tokens_per_second_per_gpu": 9913.29, "total_tokens": 862672506 }, { "epoch": 0.5463240810202551, "grad_norm": 0.8806836009025574, "learning_rate": 2e-05, "loss": 0.6329, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8739, "tokens_per_second_per_gpu": 10943.73, "total_tokens": 862770712 }, { "epoch": 0.5463865966491623, "grad_norm": 0.8654487729072571, "learning_rate": 2e-05, "loss": 0.6309, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8740, "tokens_per_second_per_gpu": 11515.06, "total_tokens": 862873157 }, { "epoch": 0.5464491122780695, "grad_norm": 0.932665228843689, "learning_rate": 2e-05, "loss": 0.6841, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8741, "tokens_per_second_per_gpu": 10909.3, "total_tokens": 862972217 }, { "epoch": 0.5465116279069767, "grad_norm": 0.8883419632911682, "learning_rate": 2e-05, "loss": 0.6632, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8742, "tokens_per_second_per_gpu": 10628.26, "total_tokens": 863070405 }, { "epoch": 0.546574143535884, "grad_norm": 0.875584602355957, "learning_rate": 2e-05, "loss": 0.605, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8743, "tokens_per_second_per_gpu": 10154.61, "total_tokens": 863164359 }, { "epoch": 0.5466366591647912, "grad_norm": 0.8825988173484802, "learning_rate": 2e-05, "loss": 0.6119, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8744, "tokens_per_second_per_gpu": 10605.07, "total_tokens": 863258594 }, { "epoch": 0.5466991747936985, "grad_norm": 0.8808729648590088, "learning_rate": 2e-05, "loss": 0.6474, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8745, "tokens_per_second_per_gpu": 10843.13, "total_tokens": 863363986 }, { "epoch": 0.5467616904226057, "grad_norm": 0.8847880959510803, "learning_rate": 2e-05, "loss": 0.634, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8746, "tokens_per_second_per_gpu": 11031.39, "total_tokens": 863467208 }, { "epoch": 0.5468242060515128, "grad_norm": 0.8639325499534607, "learning_rate": 2e-05, "loss": 0.6188, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8747, "tokens_per_second_per_gpu": 11173.37, "total_tokens": 863566809 }, { "epoch": 0.5468867216804201, "grad_norm": 0.9165849089622498, "learning_rate": 2e-05, "loss": 0.6679, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8748, "tokens_per_second_per_gpu": 10428.31, "total_tokens": 863663709 }, { "epoch": 0.5469492373093273, "grad_norm": 0.8330210447311401, "learning_rate": 2e-05, "loss": 0.5867, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8749, "tokens_per_second_per_gpu": 11589.01, "total_tokens": 863762858 }, { "epoch": 0.5470117529382346, "grad_norm": 0.882813572883606, "learning_rate": 2e-05, "loss": 0.6379, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8750, "tokens_per_second_per_gpu": 10286.13, "total_tokens": 863861873 }, { "epoch": 0.5470742685671418, "grad_norm": 0.895159900188446, "learning_rate": 2e-05, "loss": 0.6545, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8751, "tokens_per_second_per_gpu": 10075.01, "total_tokens": 863963081 }, { "epoch": 0.547136784196049, "grad_norm": 0.8657933473587036, "learning_rate": 2e-05, "loss": 0.677, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8752, "tokens_per_second_per_gpu": 11272.07, "total_tokens": 864064767 }, { "epoch": 0.5471992998249562, "grad_norm": 0.9044924378395081, "learning_rate": 2e-05, "loss": 0.6301, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8753, "tokens_per_second_per_gpu": 10359.37, "total_tokens": 864164090 }, { "epoch": 0.5472618154538634, "grad_norm": 0.9234239459037781, "learning_rate": 2e-05, "loss": 0.6327, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8754, "tokens_per_second_per_gpu": 10111.01, "total_tokens": 864259298 }, { "epoch": 0.5473243310827707, "grad_norm": 0.8872508406639099, "learning_rate": 2e-05, "loss": 0.6307, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8755, "tokens_per_second_per_gpu": 10943.43, "total_tokens": 864363181 }, { "epoch": 0.5473868467116779, "grad_norm": 0.9310570359230042, "learning_rate": 2e-05, "loss": 0.6103, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8756, "tokens_per_second_per_gpu": 10656.61, "total_tokens": 864456335 }, { "epoch": 0.5474493623405852, "grad_norm": 0.8423327803611755, "learning_rate": 2e-05, "loss": 0.6027, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8757, "tokens_per_second_per_gpu": 10469.3, "total_tokens": 864558380 }, { "epoch": 0.5475118779694924, "grad_norm": 0.8713973760604858, "learning_rate": 2e-05, "loss": 0.6448, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8758, "tokens_per_second_per_gpu": 10966.29, "total_tokens": 864659037 }, { "epoch": 0.5475743935983995, "grad_norm": 0.8729890584945679, "learning_rate": 2e-05, "loss": 0.6252, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8759, "tokens_per_second_per_gpu": 10328.71, "total_tokens": 864756811 }, { "epoch": 0.5476369092273068, "grad_norm": 0.9049729108810425, "learning_rate": 2e-05, "loss": 0.6097, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8760, "tokens_per_second_per_gpu": 10543.24, "total_tokens": 864852665 }, { "epoch": 0.547699424856214, "grad_norm": 0.8883662223815918, "learning_rate": 2e-05, "loss": 0.6091, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8761, "tokens_per_second_per_gpu": 10252.33, "total_tokens": 864949509 }, { "epoch": 0.5477619404851213, "grad_norm": 0.8714340329170227, "learning_rate": 2e-05, "loss": 0.6103, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8762, "tokens_per_second_per_gpu": 9821.27, "total_tokens": 865048445 }, { "epoch": 0.5478244561140285, "grad_norm": 0.908344566822052, "learning_rate": 2e-05, "loss": 0.6676, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8763, "tokens_per_second_per_gpu": 10634.78, "total_tokens": 865144976 }, { "epoch": 0.5478869717429358, "grad_norm": 0.8873301148414612, "learning_rate": 2e-05, "loss": 0.6363, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8764, "tokens_per_second_per_gpu": 10245.76, "total_tokens": 865242496 }, { "epoch": 0.547949487371843, "grad_norm": 0.8737460970878601, "learning_rate": 2e-05, "loss": 0.6372, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8765, "tokens_per_second_per_gpu": 11403.58, "total_tokens": 865343299 }, { "epoch": 0.5480120030007501, "grad_norm": 0.8612310290336609, "learning_rate": 2e-05, "loss": 0.5971, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8766, "tokens_per_second_per_gpu": 9610.32, "total_tokens": 865438598 }, { "epoch": 0.5480745186296574, "grad_norm": 0.8698450326919556, "learning_rate": 2e-05, "loss": 0.6186, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8767, "tokens_per_second_per_gpu": 10132.12, "total_tokens": 865537971 }, { "epoch": 0.5481370342585646, "grad_norm": 0.9288782477378845, "learning_rate": 2e-05, "loss": 0.6623, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8768, "tokens_per_second_per_gpu": 11281.3, "total_tokens": 865634170 }, { "epoch": 0.5481995498874719, "grad_norm": 0.8697952032089233, "learning_rate": 2e-05, "loss": 0.614, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8769, "tokens_per_second_per_gpu": 10815.57, "total_tokens": 865731486 }, { "epoch": 0.5482620655163791, "grad_norm": 0.8939792513847351, "learning_rate": 2e-05, "loss": 0.6601, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8770, "tokens_per_second_per_gpu": 10608.0, "total_tokens": 865829449 }, { "epoch": 0.5483245811452864, "grad_norm": 0.8809971809387207, "learning_rate": 2e-05, "loss": 0.621, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8771, "tokens_per_second_per_gpu": 10423.04, "total_tokens": 865928754 }, { "epoch": 0.5483870967741935, "grad_norm": 0.8849808573722839, "learning_rate": 2e-05, "loss": 0.6193, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8772, "tokens_per_second_per_gpu": 10422.26, "total_tokens": 866029859 }, { "epoch": 0.5484496124031008, "grad_norm": 0.8754029870033264, "learning_rate": 2e-05, "loss": 0.6602, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8773, "tokens_per_second_per_gpu": 10496.78, "total_tokens": 866128729 }, { "epoch": 0.548512128032008, "grad_norm": 0.9131746292114258, "learning_rate": 2e-05, "loss": 0.6272, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8774, "tokens_per_second_per_gpu": 10698.02, "total_tokens": 866228180 }, { "epoch": 0.5485746436609152, "grad_norm": 0.9372706413269043, "learning_rate": 2e-05, "loss": 0.617, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8775, "tokens_per_second_per_gpu": 10268.43, "total_tokens": 866326391 }, { "epoch": 0.5486371592898225, "grad_norm": 0.9420503377914429, "learning_rate": 2e-05, "loss": 0.665, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8776, "tokens_per_second_per_gpu": 10696.32, "total_tokens": 866427909 }, { "epoch": 0.5486996749187297, "grad_norm": 0.8632596135139465, "learning_rate": 2e-05, "loss": 0.6346, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8777, "tokens_per_second_per_gpu": 10707.34, "total_tokens": 866530344 }, { "epoch": 0.5487621905476369, "grad_norm": 0.8822471499443054, "learning_rate": 2e-05, "loss": 0.6821, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8778, "tokens_per_second_per_gpu": 10576.18, "total_tokens": 866630491 }, { "epoch": 0.5488247061765441, "grad_norm": 0.8942146897315979, "learning_rate": 2e-05, "loss": 0.6307, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8779, "tokens_per_second_per_gpu": 10629.11, "total_tokens": 866729152 }, { "epoch": 0.5488872218054514, "grad_norm": 0.909447968006134, "learning_rate": 2e-05, "loss": 0.6621, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8780, "tokens_per_second_per_gpu": 10866.24, "total_tokens": 866827868 }, { "epoch": 0.5489497374343586, "grad_norm": 0.8761423230171204, "learning_rate": 2e-05, "loss": 0.6549, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8781, "tokens_per_second_per_gpu": 11054.44, "total_tokens": 866931270 }, { "epoch": 0.5490122530632658, "grad_norm": 0.9123244285583496, "learning_rate": 2e-05, "loss": 0.6973, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8782, "tokens_per_second_per_gpu": 11340.27, "total_tokens": 867033870 }, { "epoch": 0.5490747686921731, "grad_norm": 0.9439188838005066, "learning_rate": 2e-05, "loss": 0.6507, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8783, "tokens_per_second_per_gpu": 10355.08, "total_tokens": 867132573 }, { "epoch": 0.5491372843210802, "grad_norm": 0.8638684153556824, "learning_rate": 2e-05, "loss": 0.6608, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8784, "tokens_per_second_per_gpu": 11245.87, "total_tokens": 867236000 }, { "epoch": 0.5491997999499875, "grad_norm": 0.8570787310600281, "learning_rate": 2e-05, "loss": 0.631, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8785, "tokens_per_second_per_gpu": 10062.99, "total_tokens": 867334243 }, { "epoch": 0.5492623155788947, "grad_norm": 0.9005186557769775, "learning_rate": 2e-05, "loss": 0.6043, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8786, "tokens_per_second_per_gpu": 9993.15, "total_tokens": 867428172 }, { "epoch": 0.549324831207802, "grad_norm": 0.9041737914085388, "learning_rate": 2e-05, "loss": 0.6194, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8787, "tokens_per_second_per_gpu": 11242.14, "total_tokens": 867527139 }, { "epoch": 0.5493873468367092, "grad_norm": 0.8860406875610352, "learning_rate": 2e-05, "loss": 0.6553, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8788, "tokens_per_second_per_gpu": 10285.63, "total_tokens": 867629104 }, { "epoch": 0.5494498624656164, "grad_norm": 0.918547511100769, "learning_rate": 2e-05, "loss": 0.6316, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8789, "tokens_per_second_per_gpu": 10630.32, "total_tokens": 867726682 }, { "epoch": 0.5495123780945236, "grad_norm": 0.9078040719032288, "learning_rate": 2e-05, "loss": 0.6574, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8790, "tokens_per_second_per_gpu": 10387.09, "total_tokens": 867825982 }, { "epoch": 0.5495748937234308, "grad_norm": 0.9536702036857605, "learning_rate": 2e-05, "loss": 0.6693, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8791, "tokens_per_second_per_gpu": 10091.71, "total_tokens": 867922982 }, { "epoch": 0.5496374093523381, "grad_norm": 0.8750367164611816, "learning_rate": 2e-05, "loss": 0.6605, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8792, "tokens_per_second_per_gpu": 9899.03, "total_tokens": 868024201 }, { "epoch": 0.5496999249812453, "grad_norm": 0.8682727217674255, "learning_rate": 2e-05, "loss": 0.6175, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8793, "tokens_per_second_per_gpu": 10651.62, "total_tokens": 868124177 }, { "epoch": 0.5497624406101526, "grad_norm": 0.895214319229126, "learning_rate": 2e-05, "loss": 0.633, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8794, "tokens_per_second_per_gpu": 10371.02, "total_tokens": 868220594 }, { "epoch": 0.5498249562390598, "grad_norm": 0.8844693899154663, "learning_rate": 2e-05, "loss": 0.6604, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8795, "tokens_per_second_per_gpu": 9516.54, "total_tokens": 868318358 }, { "epoch": 0.5498874718679669, "grad_norm": 0.8968930840492249, "learning_rate": 2e-05, "loss": 0.6164, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8796, "tokens_per_second_per_gpu": 10711.55, "total_tokens": 868416623 }, { "epoch": 0.5499499874968742, "grad_norm": 0.9000688195228577, "learning_rate": 2e-05, "loss": 0.6452, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8797, "tokens_per_second_per_gpu": 10489.38, "total_tokens": 868515183 }, { "epoch": 0.5500125031257814, "grad_norm": 0.9159275889396667, "learning_rate": 2e-05, "loss": 0.7195, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8798, "tokens_per_second_per_gpu": 13878.11, "total_tokens": 868617605 }, { "epoch": 0.5500750187546887, "grad_norm": 0.9068820476531982, "learning_rate": 2e-05, "loss": 0.6416, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8799, "tokens_per_second_per_gpu": 12990.97, "total_tokens": 868712135 }, { "epoch": 0.5501375343835959, "grad_norm": 0.868707537651062, "learning_rate": 2e-05, "loss": 0.6267, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8800, "tokens_per_second_per_gpu": 10740.46, "total_tokens": 868815853 }, { "epoch": 0.5502000500125032, "grad_norm": 0.8775017261505127, "learning_rate": 2e-05, "loss": 0.6008, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8801, "tokens_per_second_per_gpu": 10038.02, "total_tokens": 868913221 }, { "epoch": 0.5502625656414104, "grad_norm": 0.8610165119171143, "learning_rate": 2e-05, "loss": 0.596, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8802, "tokens_per_second_per_gpu": 9868.02, "total_tokens": 869009385 }, { "epoch": 0.5503250812703175, "grad_norm": 0.8892348408699036, "learning_rate": 2e-05, "loss": 0.6088, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8803, "tokens_per_second_per_gpu": 10246.57, "total_tokens": 869106241 }, { "epoch": 0.5503875968992248, "grad_norm": 0.8809367418289185, "learning_rate": 2e-05, "loss": 0.6303, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8804, "tokens_per_second_per_gpu": 10890.0, "total_tokens": 869206836 }, { "epoch": 0.550450112528132, "grad_norm": 0.8739306330680847, "learning_rate": 2e-05, "loss": 0.6454, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8805, "tokens_per_second_per_gpu": 11392.13, "total_tokens": 869312539 }, { "epoch": 0.5505126281570393, "grad_norm": 0.8739521503448486, "learning_rate": 2e-05, "loss": 0.6213, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8806, "tokens_per_second_per_gpu": 10242.78, "total_tokens": 869411493 }, { "epoch": 0.5505751437859465, "grad_norm": 0.8764354586601257, "learning_rate": 2e-05, "loss": 0.6712, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8807, "tokens_per_second_per_gpu": 11475.14, "total_tokens": 869516447 }, { "epoch": 0.5506376594148538, "grad_norm": 0.9114106893539429, "learning_rate": 2e-05, "loss": 0.6305, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8808, "tokens_per_second_per_gpu": 11119.68, "total_tokens": 869616538 }, { "epoch": 0.5507001750437609, "grad_norm": 0.8733433485031128, "learning_rate": 2e-05, "loss": 0.6352, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8809, "tokens_per_second_per_gpu": 10984.81, "total_tokens": 869717465 }, { "epoch": 0.5507626906726681, "grad_norm": 0.862962543964386, "learning_rate": 2e-05, "loss": 0.6247, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8810, "tokens_per_second_per_gpu": 11131.02, "total_tokens": 869820703 }, { "epoch": 0.5508252063015754, "grad_norm": 0.8904517292976379, "learning_rate": 2e-05, "loss": 0.6269, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8811, "tokens_per_second_per_gpu": 10709.8, "total_tokens": 869919386 }, { "epoch": 0.5508877219304826, "grad_norm": 0.8833606243133545, "learning_rate": 2e-05, "loss": 0.6037, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8812, "tokens_per_second_per_gpu": 10352.25, "total_tokens": 870016796 }, { "epoch": 0.5509502375593899, "grad_norm": 0.8932771682739258, "learning_rate": 2e-05, "loss": 0.6187, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8813, "tokens_per_second_per_gpu": 10989.56, "total_tokens": 870120121 }, { "epoch": 0.5510127531882971, "grad_norm": 0.8863975405693054, "learning_rate": 2e-05, "loss": 0.6593, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8814, "tokens_per_second_per_gpu": 10491.93, "total_tokens": 870218532 }, { "epoch": 0.5510752688172043, "grad_norm": 0.8865094780921936, "learning_rate": 2e-05, "loss": 0.634, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8815, "tokens_per_second_per_gpu": 11113.62, "total_tokens": 870320467 }, { "epoch": 0.5511377844461115, "grad_norm": 0.8660438060760498, "learning_rate": 2e-05, "loss": 0.6122, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8816, "tokens_per_second_per_gpu": 10842.56, "total_tokens": 870418177 }, { "epoch": 0.5512003000750187, "grad_norm": 0.8643627762794495, "learning_rate": 2e-05, "loss": 0.6903, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8817, "tokens_per_second_per_gpu": 11554.03, "total_tokens": 870523354 }, { "epoch": 0.551262815703926, "grad_norm": 0.9228298664093018, "learning_rate": 2e-05, "loss": 0.6492, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8818, "tokens_per_second_per_gpu": 10963.85, "total_tokens": 870623204 }, { "epoch": 0.5513253313328332, "grad_norm": 0.8915716409683228, "learning_rate": 2e-05, "loss": 0.6463, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8819, "tokens_per_second_per_gpu": 10482.44, "total_tokens": 870723279 }, { "epoch": 0.5513878469617405, "grad_norm": 0.887165367603302, "learning_rate": 2e-05, "loss": 0.6426, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8820, "tokens_per_second_per_gpu": 10917.35, "total_tokens": 870822388 }, { "epoch": 0.5514503625906476, "grad_norm": 0.885103166103363, "learning_rate": 2e-05, "loss": 0.6657, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8821, "tokens_per_second_per_gpu": 10165.48, "total_tokens": 870923484 }, { "epoch": 0.5515128782195549, "grad_norm": 0.8435880541801453, "learning_rate": 2e-05, "loss": 0.6344, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8822, "tokens_per_second_per_gpu": 11344.84, "total_tokens": 871027176 }, { "epoch": 0.5515753938484621, "grad_norm": 0.8771277666091919, "learning_rate": 2e-05, "loss": 0.6602, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8823, "tokens_per_second_per_gpu": 11153.49, "total_tokens": 871130761 }, { "epoch": 0.5516379094773693, "grad_norm": 0.8874414563179016, "learning_rate": 2e-05, "loss": 0.6598, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8824, "tokens_per_second_per_gpu": 10719.51, "total_tokens": 871232939 }, { "epoch": 0.5517004251062766, "grad_norm": 0.8521776795387268, "learning_rate": 2e-05, "loss": 0.5822, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8825, "tokens_per_second_per_gpu": 10258.64, "total_tokens": 871328590 }, { "epoch": 0.5517629407351838, "grad_norm": 0.8922610878944397, "learning_rate": 2e-05, "loss": 0.629, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8826, "tokens_per_second_per_gpu": 10245.24, "total_tokens": 871425129 }, { "epoch": 0.551825456364091, "grad_norm": 0.9055283069610596, "learning_rate": 2e-05, "loss": 0.6481, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8827, "tokens_per_second_per_gpu": 10455.44, "total_tokens": 871520584 }, { "epoch": 0.5518879719929982, "grad_norm": 0.9350957274436951, "learning_rate": 2e-05, "loss": 0.6623, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8828, "tokens_per_second_per_gpu": 10298.32, "total_tokens": 871618504 }, { "epoch": 0.5519504876219055, "grad_norm": 0.8785763382911682, "learning_rate": 2e-05, "loss": 0.6493, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8829, "tokens_per_second_per_gpu": 10947.49, "total_tokens": 871721343 }, { "epoch": 0.5520130032508127, "grad_norm": 0.8862062096595764, "learning_rate": 2e-05, "loss": 0.5844, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8830, "tokens_per_second_per_gpu": 10587.69, "total_tokens": 871816528 }, { "epoch": 0.55207551887972, "grad_norm": 0.9054977893829346, "learning_rate": 2e-05, "loss": 0.613, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8831, "tokens_per_second_per_gpu": 10725.66, "total_tokens": 871913503 }, { "epoch": 0.5521380345086272, "grad_norm": 0.8878880143165588, "learning_rate": 2e-05, "loss": 0.6866, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8832, "tokens_per_second_per_gpu": 11410.68, "total_tokens": 872016178 }, { "epoch": 0.5522005501375343, "grad_norm": 0.9031578898429871, "learning_rate": 2e-05, "loss": 0.6006, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8833, "tokens_per_second_per_gpu": 9606.39, "total_tokens": 872110676 }, { "epoch": 0.5522630657664416, "grad_norm": 0.878774881362915, "learning_rate": 2e-05, "loss": 0.6221, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8834, "tokens_per_second_per_gpu": 9987.45, "total_tokens": 872208027 }, { "epoch": 0.5523255813953488, "grad_norm": 0.8640003204345703, "learning_rate": 2e-05, "loss": 0.6225, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8835, "tokens_per_second_per_gpu": 10329.38, "total_tokens": 872308653 }, { "epoch": 0.5523880970242561, "grad_norm": 0.8871076703071594, "learning_rate": 2e-05, "loss": 0.6477, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8836, "tokens_per_second_per_gpu": 11072.35, "total_tokens": 872409930 }, { "epoch": 0.5524506126531633, "grad_norm": 0.9149996042251587, "learning_rate": 2e-05, "loss": 0.6476, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8837, "tokens_per_second_per_gpu": 11472.88, "total_tokens": 872513369 }, { "epoch": 0.5525131282820706, "grad_norm": 0.9041509032249451, "learning_rate": 2e-05, "loss": 0.6215, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8838, "tokens_per_second_per_gpu": 10345.09, "total_tokens": 872609880 }, { "epoch": 0.5525756439109778, "grad_norm": 0.8750714063644409, "learning_rate": 2e-05, "loss": 0.6584, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8839, "tokens_per_second_per_gpu": 11062.55, "total_tokens": 872712986 }, { "epoch": 0.5526381595398849, "grad_norm": 0.9124102592468262, "learning_rate": 2e-05, "loss": 0.6181, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8840, "tokens_per_second_per_gpu": 10991.83, "total_tokens": 872811368 }, { "epoch": 0.5527006751687922, "grad_norm": 0.8748160004615784, "learning_rate": 2e-05, "loss": 0.6043, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8841, "tokens_per_second_per_gpu": 9961.42, "total_tokens": 872907126 }, { "epoch": 0.5527631907976994, "grad_norm": 0.8890608549118042, "learning_rate": 2e-05, "loss": 0.6624, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8842, "tokens_per_second_per_gpu": 10793.42, "total_tokens": 873009682 }, { "epoch": 0.5528257064266067, "grad_norm": 0.8689658641815186, "learning_rate": 2e-05, "loss": 0.6289, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8843, "tokens_per_second_per_gpu": 10750.95, "total_tokens": 873107759 }, { "epoch": 0.5528882220555139, "grad_norm": 0.8628271818161011, "learning_rate": 2e-05, "loss": 0.598, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8844, "tokens_per_second_per_gpu": 10934.82, "total_tokens": 873206326 }, { "epoch": 0.5529507376844212, "grad_norm": 0.9427613019943237, "learning_rate": 2e-05, "loss": 0.6265, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8845, "tokens_per_second_per_gpu": 10470.15, "total_tokens": 873300409 }, { "epoch": 0.5530132533133283, "grad_norm": 0.8598540425300598, "learning_rate": 2e-05, "loss": 0.6115, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8846, "tokens_per_second_per_gpu": 10578.61, "total_tokens": 873400200 }, { "epoch": 0.5530757689422355, "grad_norm": 0.895449161529541, "learning_rate": 2e-05, "loss": 0.6513, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8847, "tokens_per_second_per_gpu": 10974.93, "total_tokens": 873501386 }, { "epoch": 0.5531382845711428, "grad_norm": 0.8790807723999023, "learning_rate": 2e-05, "loss": 0.5934, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8848, "tokens_per_second_per_gpu": 10549.93, "total_tokens": 873596004 }, { "epoch": 0.55320080020005, "grad_norm": 0.8971964120864868, "learning_rate": 2e-05, "loss": 0.5875, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8849, "tokens_per_second_per_gpu": 10460.69, "total_tokens": 873689483 }, { "epoch": 0.5532633158289573, "grad_norm": 0.8936404585838318, "learning_rate": 2e-05, "loss": 0.6362, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8850, "tokens_per_second_per_gpu": 10464.76, "total_tokens": 873789002 }, { "epoch": 0.5533258314578645, "grad_norm": 0.8635317087173462, "learning_rate": 2e-05, "loss": 0.6387, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8851, "tokens_per_second_per_gpu": 10848.25, "total_tokens": 873892171 }, { "epoch": 0.5533883470867716, "grad_norm": 0.8345627784729004, "learning_rate": 2e-05, "loss": 0.609, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8852, "tokens_per_second_per_gpu": 10449.77, "total_tokens": 873989538 }, { "epoch": 0.5534508627156789, "grad_norm": 0.8422046303749084, "learning_rate": 2e-05, "loss": 0.6159, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8853, "tokens_per_second_per_gpu": 10913.82, "total_tokens": 874090560 }, { "epoch": 0.5535133783445861, "grad_norm": 0.8852335214614868, "learning_rate": 2e-05, "loss": 0.6444, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8854, "tokens_per_second_per_gpu": 10608.04, "total_tokens": 874192571 }, { "epoch": 0.5535758939734934, "grad_norm": 0.8947763442993164, "learning_rate": 2e-05, "loss": 0.627, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8855, "tokens_per_second_per_gpu": 10450.23, "total_tokens": 874289652 }, { "epoch": 0.5536384096024006, "grad_norm": 0.886052131652832, "learning_rate": 2e-05, "loss": 0.6351, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8856, "tokens_per_second_per_gpu": 10159.9, "total_tokens": 874387176 }, { "epoch": 0.5537009252313079, "grad_norm": 0.8745972514152527, "learning_rate": 2e-05, "loss": 0.6129, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8857, "tokens_per_second_per_gpu": 10235.95, "total_tokens": 874483392 }, { "epoch": 0.553763440860215, "grad_norm": 0.8766373991966248, "learning_rate": 2e-05, "loss": 0.6153, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8858, "tokens_per_second_per_gpu": 10668.88, "total_tokens": 874579657 }, { "epoch": 0.5538259564891223, "grad_norm": 0.891980767250061, "learning_rate": 2e-05, "loss": 0.633, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8859, "tokens_per_second_per_gpu": 9889.02, "total_tokens": 874672841 }, { "epoch": 0.5538884721180295, "grad_norm": 0.9273962378501892, "learning_rate": 2e-05, "loss": 0.6266, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8860, "tokens_per_second_per_gpu": 10699.85, "total_tokens": 874775402 }, { "epoch": 0.5539509877469367, "grad_norm": 0.9272983074188232, "learning_rate": 2e-05, "loss": 0.617, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8861, "tokens_per_second_per_gpu": 10144.62, "total_tokens": 874869337 }, { "epoch": 0.554013503375844, "grad_norm": 0.9175335168838501, "learning_rate": 2e-05, "loss": 0.6466, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8862, "tokens_per_second_per_gpu": 9547.39, "total_tokens": 874962914 }, { "epoch": 0.5540760190047512, "grad_norm": 0.8966027498245239, "learning_rate": 2e-05, "loss": 0.6121, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8863, "tokens_per_second_per_gpu": 11241.44, "total_tokens": 875065508 }, { "epoch": 0.5541385346336584, "grad_norm": 0.862176239490509, "learning_rate": 2e-05, "loss": 0.6071, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8864, "tokens_per_second_per_gpu": 11172.39, "total_tokens": 875166462 }, { "epoch": 0.5542010502625656, "grad_norm": 0.9166427850723267, "learning_rate": 2e-05, "loss": 0.6422, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8865, "tokens_per_second_per_gpu": 10573.41, "total_tokens": 875264789 }, { "epoch": 0.5542635658914729, "grad_norm": 0.9162874817848206, "learning_rate": 2e-05, "loss": 0.6134, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8866, "tokens_per_second_per_gpu": 9223.69, "total_tokens": 875357658 }, { "epoch": 0.5543260815203801, "grad_norm": 0.8796079158782959, "learning_rate": 2e-05, "loss": 0.6431, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8867, "tokens_per_second_per_gpu": 10138.12, "total_tokens": 875456932 }, { "epoch": 0.5543885971492873, "grad_norm": 0.846620500087738, "learning_rate": 2e-05, "loss": 0.6098, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8868, "tokens_per_second_per_gpu": 11306.22, "total_tokens": 875558139 }, { "epoch": 0.5544511127781946, "grad_norm": 0.9109795093536377, "learning_rate": 2e-05, "loss": 0.612, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8869, "tokens_per_second_per_gpu": 9967.22, "total_tokens": 875655010 }, { "epoch": 0.5545136284071017, "grad_norm": 0.898274838924408, "learning_rate": 2e-05, "loss": 0.6536, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8870, "tokens_per_second_per_gpu": 10618.75, "total_tokens": 875754770 }, { "epoch": 0.554576144036009, "grad_norm": 0.8823093771934509, "learning_rate": 2e-05, "loss": 0.6754, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8871, "tokens_per_second_per_gpu": 10743.9, "total_tokens": 875854963 }, { "epoch": 0.5546386596649162, "grad_norm": 0.9006643295288086, "learning_rate": 2e-05, "loss": 0.7256, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8872, "tokens_per_second_per_gpu": 10750.33, "total_tokens": 875956181 }, { "epoch": 0.5547011752938235, "grad_norm": 0.9052042961120605, "learning_rate": 2e-05, "loss": 0.6842, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8873, "tokens_per_second_per_gpu": 10636.99, "total_tokens": 876055018 }, { "epoch": 0.5547636909227307, "grad_norm": 0.8656579256057739, "learning_rate": 2e-05, "loss": 0.6376, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8874, "tokens_per_second_per_gpu": 11086.5, "total_tokens": 876158401 }, { "epoch": 0.554826206551638, "grad_norm": 0.8640072345733643, "learning_rate": 2e-05, "loss": 0.6153, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8875, "tokens_per_second_per_gpu": 9740.52, "total_tokens": 876253174 }, { "epoch": 0.5548887221805452, "grad_norm": 0.8851577043533325, "learning_rate": 2e-05, "loss": 0.6308, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8876, "tokens_per_second_per_gpu": 10766.93, "total_tokens": 876353718 }, { "epoch": 0.5549512378094523, "grad_norm": 0.8722220063209534, "learning_rate": 2e-05, "loss": 0.64, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8877, "tokens_per_second_per_gpu": 10911.03, "total_tokens": 876455161 }, { "epoch": 0.5550137534383596, "grad_norm": 0.8531261682510376, "learning_rate": 2e-05, "loss": 0.626, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8878, "tokens_per_second_per_gpu": 10886.18, "total_tokens": 876554573 }, { "epoch": 0.5550762690672668, "grad_norm": 0.8553808331489563, "learning_rate": 2e-05, "loss": 0.637, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8879, "tokens_per_second_per_gpu": 10549.37, "total_tokens": 876654111 }, { "epoch": 0.5551387846961741, "grad_norm": 0.8676661849021912, "learning_rate": 2e-05, "loss": 0.6246, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8880, "tokens_per_second_per_gpu": 10997.63, "total_tokens": 876756378 }, { "epoch": 0.5552013003250813, "grad_norm": 0.8541688323020935, "learning_rate": 2e-05, "loss": 0.6143, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8881, "tokens_per_second_per_gpu": 10908.19, "total_tokens": 876856402 }, { "epoch": 0.5552638159539885, "grad_norm": 0.8660390377044678, "learning_rate": 2e-05, "loss": 0.6228, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8882, "tokens_per_second_per_gpu": 9666.25, "total_tokens": 876955881 }, { "epoch": 0.5553263315828957, "grad_norm": 0.8556066751480103, "learning_rate": 2e-05, "loss": 0.6026, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8883, "tokens_per_second_per_gpu": 10463.74, "total_tokens": 877054435 }, { "epoch": 0.5553888472118029, "grad_norm": 0.8357222080230713, "learning_rate": 2e-05, "loss": 0.604, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8884, "tokens_per_second_per_gpu": 10632.79, "total_tokens": 877154990 }, { "epoch": 0.5554513628407102, "grad_norm": 0.9286598563194275, "learning_rate": 2e-05, "loss": 0.6165, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8885, "tokens_per_second_per_gpu": 10318.0, "total_tokens": 877254672 }, { "epoch": 0.5555138784696174, "grad_norm": 0.8639892935752869, "learning_rate": 2e-05, "loss": 0.6444, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8886, "tokens_per_second_per_gpu": 11269.91, "total_tokens": 877357535 }, { "epoch": 0.5555763940985247, "grad_norm": 0.8494797348976135, "learning_rate": 2e-05, "loss": 0.5814, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8887, "tokens_per_second_per_gpu": 9929.95, "total_tokens": 877451729 }, { "epoch": 0.5556389097274319, "grad_norm": 0.8903186321258545, "learning_rate": 2e-05, "loss": 0.6189, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8888, "tokens_per_second_per_gpu": 10351.96, "total_tokens": 877550197 }, { "epoch": 0.555701425356339, "grad_norm": 0.8737451434135437, "learning_rate": 2e-05, "loss": 0.6081, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8889, "tokens_per_second_per_gpu": 10131.37, "total_tokens": 877649081 }, { "epoch": 0.5557639409852463, "grad_norm": 0.8493471145629883, "learning_rate": 2e-05, "loss": 0.597, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8890, "tokens_per_second_per_gpu": 10444.32, "total_tokens": 877747041 }, { "epoch": 0.5558264566141535, "grad_norm": 0.9497326016426086, "learning_rate": 2e-05, "loss": 0.6515, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8891, "tokens_per_second_per_gpu": 10549.61, "total_tokens": 877847806 }, { "epoch": 0.5558889722430608, "grad_norm": 0.8903963565826416, "learning_rate": 2e-05, "loss": 0.6196, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8892, "tokens_per_second_per_gpu": 11055.31, "total_tokens": 877949199 }, { "epoch": 0.555951487871968, "grad_norm": 0.8778284192085266, "learning_rate": 2e-05, "loss": 0.6201, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8893, "tokens_per_second_per_gpu": 10946.07, "total_tokens": 878050336 }, { "epoch": 0.5560140035008753, "grad_norm": 0.8821483254432678, "learning_rate": 2e-05, "loss": 0.6147, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8894, "tokens_per_second_per_gpu": 10957.51, "total_tokens": 878148897 }, { "epoch": 0.5560765191297824, "grad_norm": 0.9172369241714478, "learning_rate": 2e-05, "loss": 0.656, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8895, "tokens_per_second_per_gpu": 10713.9, "total_tokens": 878251260 }, { "epoch": 0.5561390347586896, "grad_norm": 0.9033398628234863, "learning_rate": 2e-05, "loss": 0.644, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8896, "tokens_per_second_per_gpu": 10862.54, "total_tokens": 878351363 }, { "epoch": 0.5562015503875969, "grad_norm": 0.8650384545326233, "learning_rate": 2e-05, "loss": 0.5916, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8897, "tokens_per_second_per_gpu": 9066.29, "total_tokens": 878447030 }, { "epoch": 0.5562640660165041, "grad_norm": 0.8851843476295471, "learning_rate": 2e-05, "loss": 0.6246, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8898, "tokens_per_second_per_gpu": 10903.12, "total_tokens": 878547989 }, { "epoch": 0.5563265816454114, "grad_norm": 0.9006678462028503, "learning_rate": 2e-05, "loss": 0.6307, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8899, "tokens_per_second_per_gpu": 10496.51, "total_tokens": 878646944 }, { "epoch": 0.5563890972743186, "grad_norm": 0.9135319590568542, "learning_rate": 2e-05, "loss": 0.6725, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8900, "tokens_per_second_per_gpu": 9868.66, "total_tokens": 878745268 }, { "epoch": 0.5564516129032258, "grad_norm": 0.8786661624908447, "learning_rate": 2e-05, "loss": 0.6627, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8901, "tokens_per_second_per_gpu": 11061.27, "total_tokens": 878846207 }, { "epoch": 0.556514128532133, "grad_norm": 0.8905525803565979, "learning_rate": 2e-05, "loss": 0.6493, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8902, "tokens_per_second_per_gpu": 10895.32, "total_tokens": 878945266 }, { "epoch": 0.5565766441610402, "grad_norm": 0.8699895143508911, "learning_rate": 2e-05, "loss": 0.6204, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8903, "tokens_per_second_per_gpu": 8907.8, "total_tokens": 879043059 }, { "epoch": 0.5566391597899475, "grad_norm": 0.9092170596122742, "learning_rate": 2e-05, "loss": 0.6694, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8904, "tokens_per_second_per_gpu": 10933.75, "total_tokens": 879146190 }, { "epoch": 0.5567016754188547, "grad_norm": 0.925756573677063, "learning_rate": 2e-05, "loss": 0.7003, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8905, "tokens_per_second_per_gpu": 11340.22, "total_tokens": 879250695 }, { "epoch": 0.556764191047762, "grad_norm": 0.8607668876647949, "learning_rate": 2e-05, "loss": 0.6045, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8906, "tokens_per_second_per_gpu": 11299.48, "total_tokens": 879348725 }, { "epoch": 0.5568267066766691, "grad_norm": 0.91151362657547, "learning_rate": 2e-05, "loss": 0.6729, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8907, "tokens_per_second_per_gpu": 10354.89, "total_tokens": 879447012 }, { "epoch": 0.5568892223055764, "grad_norm": 0.8679163455963135, "learning_rate": 2e-05, "loss": 0.6581, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8908, "tokens_per_second_per_gpu": 10653.36, "total_tokens": 879549378 }, { "epoch": 0.5569517379344836, "grad_norm": 0.8786592483520508, "learning_rate": 2e-05, "loss": 0.6034, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8909, "tokens_per_second_per_gpu": 9992.25, "total_tokens": 879645872 }, { "epoch": 0.5570142535633908, "grad_norm": 0.8855708837509155, "learning_rate": 2e-05, "loss": 0.5966, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8910, "tokens_per_second_per_gpu": 10112.65, "total_tokens": 879744023 }, { "epoch": 0.5570767691922981, "grad_norm": 0.88010174036026, "learning_rate": 2e-05, "loss": 0.6378, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8911, "tokens_per_second_per_gpu": 10516.0, "total_tokens": 879840433 }, { "epoch": 0.5571392848212053, "grad_norm": 0.9278603792190552, "learning_rate": 2e-05, "loss": 0.626, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8912, "tokens_per_second_per_gpu": 10610.65, "total_tokens": 879940164 }, { "epoch": 0.5572018004501126, "grad_norm": 0.880797266960144, "learning_rate": 2e-05, "loss": 0.6617, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8913, "tokens_per_second_per_gpu": 10592.44, "total_tokens": 880041192 }, { "epoch": 0.5572643160790197, "grad_norm": 0.9474532604217529, "learning_rate": 2e-05, "loss": 0.6282, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8914, "tokens_per_second_per_gpu": 10596.63, "total_tokens": 880138888 }, { "epoch": 0.557326831707927, "grad_norm": 0.8912432193756104, "learning_rate": 2e-05, "loss": 0.6672, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8915, "tokens_per_second_per_gpu": 10790.58, "total_tokens": 880239639 }, { "epoch": 0.5573893473368342, "grad_norm": 0.8725674152374268, "learning_rate": 2e-05, "loss": 0.6332, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8916, "tokens_per_second_per_gpu": 10808.29, "total_tokens": 880340808 }, { "epoch": 0.5574518629657415, "grad_norm": 0.8983219861984253, "learning_rate": 2e-05, "loss": 0.645, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8917, "tokens_per_second_per_gpu": 10365.15, "total_tokens": 880436700 }, { "epoch": 0.5575143785946487, "grad_norm": 0.8921599388122559, "learning_rate": 2e-05, "loss": 0.6472, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8918, "tokens_per_second_per_gpu": 10680.23, "total_tokens": 880537340 }, { "epoch": 0.5575768942235559, "grad_norm": 0.8868309855461121, "learning_rate": 2e-05, "loss": 0.6408, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8919, "tokens_per_second_per_gpu": 10828.75, "total_tokens": 880637909 }, { "epoch": 0.5576394098524631, "grad_norm": 0.9206545352935791, "learning_rate": 2e-05, "loss": 0.639, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8920, "tokens_per_second_per_gpu": 10443.15, "total_tokens": 880732784 }, { "epoch": 0.5577019254813703, "grad_norm": 0.9005269408226013, "learning_rate": 2e-05, "loss": 0.6586, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8921, "tokens_per_second_per_gpu": 11231.43, "total_tokens": 880835042 }, { "epoch": 0.5577644411102776, "grad_norm": 1.0464016199111938, "learning_rate": 2e-05, "loss": 0.5986, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8922, "tokens_per_second_per_gpu": 10381.5, "total_tokens": 880932092 }, { "epoch": 0.5578269567391848, "grad_norm": 0.8714735507965088, "learning_rate": 2e-05, "loss": 0.6339, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8923, "tokens_per_second_per_gpu": 10629.57, "total_tokens": 881033571 }, { "epoch": 0.557889472368092, "grad_norm": 0.8664013743400574, "learning_rate": 2e-05, "loss": 0.6332, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8924, "tokens_per_second_per_gpu": 10856.03, "total_tokens": 881135060 }, { "epoch": 0.5579519879969993, "grad_norm": 0.8642182350158691, "learning_rate": 2e-05, "loss": 0.6339, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8925, "tokens_per_second_per_gpu": 10923.33, "total_tokens": 881235933 }, { "epoch": 0.5580145036259064, "grad_norm": 0.916065514087677, "learning_rate": 2e-05, "loss": 0.6421, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8926, "tokens_per_second_per_gpu": 10680.12, "total_tokens": 881338246 }, { "epoch": 0.5580770192548137, "grad_norm": 1.1078957319259644, "learning_rate": 2e-05, "loss": 0.6442, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8927, "tokens_per_second_per_gpu": 11245.55, "total_tokens": 881435719 }, { "epoch": 0.5581395348837209, "grad_norm": 0.9223042130470276, "learning_rate": 2e-05, "loss": 0.5846, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8928, "tokens_per_second_per_gpu": 10275.02, "total_tokens": 881529864 }, { "epoch": 0.5582020505126282, "grad_norm": 0.8688248991966248, "learning_rate": 2e-05, "loss": 0.629, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8929, "tokens_per_second_per_gpu": 10917.92, "total_tokens": 881630666 }, { "epoch": 0.5582645661415354, "grad_norm": 0.8964966535568237, "learning_rate": 2e-05, "loss": 0.6527, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8930, "tokens_per_second_per_gpu": 10089.37, "total_tokens": 881728251 }, { "epoch": 0.5583270817704427, "grad_norm": 0.9553126096725464, "learning_rate": 2e-05, "loss": 0.6374, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8931, "tokens_per_second_per_gpu": 10699.0, "total_tokens": 881828570 }, { "epoch": 0.5583895973993498, "grad_norm": 1.2035439014434814, "learning_rate": 2e-05, "loss": 0.6035, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8932, "tokens_per_second_per_gpu": 9619.26, "total_tokens": 881920713 }, { "epoch": 0.558452113028257, "grad_norm": 0.8984946012496948, "learning_rate": 2e-05, "loss": 0.6702, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8933, "tokens_per_second_per_gpu": 10200.47, "total_tokens": 882021584 }, { "epoch": 0.5585146286571643, "grad_norm": 0.8696998357772827, "learning_rate": 2e-05, "loss": 0.6459, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8934, "tokens_per_second_per_gpu": 10458.38, "total_tokens": 882122632 }, { "epoch": 0.5585771442860715, "grad_norm": 0.8827133178710938, "learning_rate": 2e-05, "loss": 0.623, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8935, "tokens_per_second_per_gpu": 10327.33, "total_tokens": 882218379 }, { "epoch": 0.5586396599149788, "grad_norm": 0.908805787563324, "learning_rate": 2e-05, "loss": 0.6302, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8936, "tokens_per_second_per_gpu": 10388.58, "total_tokens": 882315900 }, { "epoch": 0.558702175543886, "grad_norm": 0.9194384813308716, "learning_rate": 2e-05, "loss": 0.647, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8937, "tokens_per_second_per_gpu": 10523.44, "total_tokens": 882414048 }, { "epoch": 0.5587646911727931, "grad_norm": 0.8548731207847595, "learning_rate": 2e-05, "loss": 0.6262, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8938, "tokens_per_second_per_gpu": 10736.2, "total_tokens": 882515397 }, { "epoch": 0.5588272068017004, "grad_norm": 0.8845183253288269, "learning_rate": 2e-05, "loss": 0.6381, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8939, "tokens_per_second_per_gpu": 10275.88, "total_tokens": 882615676 }, { "epoch": 0.5588897224306076, "grad_norm": 0.8417638540267944, "learning_rate": 2e-05, "loss": 0.6014, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8940, "tokens_per_second_per_gpu": 10445.44, "total_tokens": 882713338 }, { "epoch": 0.5589522380595149, "grad_norm": 0.911968469619751, "learning_rate": 2e-05, "loss": 0.6853, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8941, "tokens_per_second_per_gpu": 10313.35, "total_tokens": 882813893 }, { "epoch": 0.5590147536884221, "grad_norm": 0.8994210958480835, "learning_rate": 2e-05, "loss": 0.5918, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8942, "tokens_per_second_per_gpu": 9800.28, "total_tokens": 882907017 }, { "epoch": 0.5590772693173294, "grad_norm": 0.8752309679985046, "learning_rate": 2e-05, "loss": 0.6536, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8943, "tokens_per_second_per_gpu": 10952.86, "total_tokens": 883009841 }, { "epoch": 0.5591397849462365, "grad_norm": 0.8857231140136719, "learning_rate": 2e-05, "loss": 0.6614, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8944, "tokens_per_second_per_gpu": 10384.0, "total_tokens": 883111402 }, { "epoch": 0.5592023005751438, "grad_norm": 0.8861830234527588, "learning_rate": 2e-05, "loss": 0.6146, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8945, "tokens_per_second_per_gpu": 11117.86, "total_tokens": 883209867 }, { "epoch": 0.559264816204051, "grad_norm": 0.9137179255485535, "learning_rate": 2e-05, "loss": 0.67, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8946, "tokens_per_second_per_gpu": 10796.88, "total_tokens": 883309046 }, { "epoch": 0.5593273318329582, "grad_norm": 0.9190645813941956, "learning_rate": 2e-05, "loss": 0.604, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8947, "tokens_per_second_per_gpu": 10548.27, "total_tokens": 883405556 }, { "epoch": 0.5593898474618655, "grad_norm": 0.9319021701812744, "learning_rate": 2e-05, "loss": 0.6542, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8948, "tokens_per_second_per_gpu": 11273.05, "total_tokens": 883506059 }, { "epoch": 0.5594523630907727, "grad_norm": 0.886062502861023, "learning_rate": 2e-05, "loss": 0.6079, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8949, "tokens_per_second_per_gpu": 9828.41, "total_tokens": 883600556 }, { "epoch": 0.5595148787196799, "grad_norm": 0.8661903142929077, "learning_rate": 2e-05, "loss": 0.6519, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8950, "tokens_per_second_per_gpu": 10894.34, "total_tokens": 883703557 }, { "epoch": 0.5595773943485871, "grad_norm": 0.9056621193885803, "learning_rate": 2e-05, "loss": 0.6595, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8951, "tokens_per_second_per_gpu": 10201.01, "total_tokens": 883804480 }, { "epoch": 0.5596399099774944, "grad_norm": 0.8880151510238647, "learning_rate": 2e-05, "loss": 0.637, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8952, "tokens_per_second_per_gpu": 10134.46, "total_tokens": 883904859 }, { "epoch": 0.5597024256064016, "grad_norm": 0.8807510733604431, "learning_rate": 2e-05, "loss": 0.6331, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8953, "tokens_per_second_per_gpu": 10452.92, "total_tokens": 884005215 }, { "epoch": 0.5597649412353088, "grad_norm": 0.9601285457611084, "learning_rate": 2e-05, "loss": 0.64, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8954, "tokens_per_second_per_gpu": 10848.43, "total_tokens": 884106900 }, { "epoch": 0.5598274568642161, "grad_norm": 0.9249861240386963, "learning_rate": 2e-05, "loss": 0.6252, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8955, "tokens_per_second_per_gpu": 10814.18, "total_tokens": 884206982 }, { "epoch": 0.5598899724931233, "grad_norm": 0.9132299423217773, "learning_rate": 2e-05, "loss": 0.6227, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8956, "tokens_per_second_per_gpu": 9781.39, "total_tokens": 884301566 }, { "epoch": 0.5599524881220305, "grad_norm": 0.8522695302963257, "learning_rate": 2e-05, "loss": 0.6246, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8957, "tokens_per_second_per_gpu": 10894.17, "total_tokens": 884403781 }, { "epoch": 0.5600150037509377, "grad_norm": 0.857029139995575, "learning_rate": 2e-05, "loss": 0.6214, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8958, "tokens_per_second_per_gpu": 10662.15, "total_tokens": 884503514 }, { "epoch": 0.560077519379845, "grad_norm": 0.9543066024780273, "learning_rate": 2e-05, "loss": 0.6246, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8959, "tokens_per_second_per_gpu": 9751.56, "total_tokens": 884600318 }, { "epoch": 0.5601400350087522, "grad_norm": 0.9084818959236145, "learning_rate": 2e-05, "loss": 0.6781, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8960, "tokens_per_second_per_gpu": 11098.17, "total_tokens": 884699245 }, { "epoch": 0.5602025506376594, "grad_norm": 0.9348719120025635, "learning_rate": 2e-05, "loss": 0.6285, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8961, "tokens_per_second_per_gpu": 10707.26, "total_tokens": 884798208 }, { "epoch": 0.5602650662665667, "grad_norm": 0.9138741493225098, "learning_rate": 2e-05, "loss": 0.6591, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8962, "tokens_per_second_per_gpu": 10626.79, "total_tokens": 884896550 }, { "epoch": 0.5603275818954738, "grad_norm": 0.8630855679512024, "learning_rate": 2e-05, "loss": 0.6094, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8963, "tokens_per_second_per_gpu": 11165.48, "total_tokens": 884996916 }, { "epoch": 0.5603900975243811, "grad_norm": 0.9105440378189087, "learning_rate": 2e-05, "loss": 0.6043, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8964, "tokens_per_second_per_gpu": 10672.78, "total_tokens": 885094093 }, { "epoch": 0.5604526131532883, "grad_norm": 0.9050712585449219, "learning_rate": 2e-05, "loss": 0.6244, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8965, "tokens_per_second_per_gpu": 10499.0, "total_tokens": 885192132 }, { "epoch": 0.5605151287821956, "grad_norm": 0.8660333752632141, "learning_rate": 2e-05, "loss": 0.6353, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8966, "tokens_per_second_per_gpu": 10782.66, "total_tokens": 885295349 }, { "epoch": 0.5605776444111028, "grad_norm": 0.9056552648544312, "learning_rate": 2e-05, "loss": 0.6349, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8967, "tokens_per_second_per_gpu": 10353.96, "total_tokens": 885395014 }, { "epoch": 0.56064016004001, "grad_norm": 0.8678483963012695, "learning_rate": 2e-05, "loss": 0.611, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8968, "tokens_per_second_per_gpu": 10592.49, "total_tokens": 885494194 }, { "epoch": 0.5607026756689172, "grad_norm": 0.872728168964386, "learning_rate": 2e-05, "loss": 0.6454, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8969, "tokens_per_second_per_gpu": 10522.88, "total_tokens": 885593487 }, { "epoch": 0.5607651912978244, "grad_norm": 0.9256542921066284, "learning_rate": 2e-05, "loss": 0.6616, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8970, "tokens_per_second_per_gpu": 10623.76, "total_tokens": 885689524 }, { "epoch": 0.5608277069267317, "grad_norm": 0.933305025100708, "learning_rate": 2e-05, "loss": 0.6166, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8971, "tokens_per_second_per_gpu": 10266.59, "total_tokens": 885788717 }, { "epoch": 0.5608902225556389, "grad_norm": 0.890998899936676, "learning_rate": 2e-05, "loss": 0.6732, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8972, "tokens_per_second_per_gpu": 10892.98, "total_tokens": 885887112 }, { "epoch": 0.5609527381845462, "grad_norm": 0.9345607161521912, "learning_rate": 2e-05, "loss": 0.7087, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8973, "tokens_per_second_per_gpu": 11094.48, "total_tokens": 885988329 }, { "epoch": 0.5610152538134534, "grad_norm": 0.9314256906509399, "learning_rate": 2e-05, "loss": 0.6648, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8974, "tokens_per_second_per_gpu": 10372.76, "total_tokens": 886086232 }, { "epoch": 0.5610777694423605, "grad_norm": 0.9031565189361572, "learning_rate": 2e-05, "loss": 0.6484, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8975, "tokens_per_second_per_gpu": 10645.46, "total_tokens": 886185240 }, { "epoch": 0.5611402850712678, "grad_norm": 0.9382085800170898, "learning_rate": 2e-05, "loss": 0.6763, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8976, "tokens_per_second_per_gpu": 10405.28, "total_tokens": 886288507 }, { "epoch": 0.561202800700175, "grad_norm": 0.9104456901550293, "learning_rate": 2e-05, "loss": 0.6103, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8977, "tokens_per_second_per_gpu": 10731.19, "total_tokens": 886387676 }, { "epoch": 0.5612653163290823, "grad_norm": 0.9319635033607483, "learning_rate": 2e-05, "loss": 0.6906, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8978, "tokens_per_second_per_gpu": 9858.95, "total_tokens": 886484476 }, { "epoch": 0.5613278319579895, "grad_norm": 0.9347566366195679, "learning_rate": 2e-05, "loss": 0.6613, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8979, "tokens_per_second_per_gpu": 10568.34, "total_tokens": 886583314 }, { "epoch": 0.5613903475868968, "grad_norm": 1.0006848573684692, "learning_rate": 2e-05, "loss": 0.6114, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8980, "tokens_per_second_per_gpu": 10584.44, "total_tokens": 886680829 }, { "epoch": 0.5614528632158039, "grad_norm": 0.8773366808891296, "learning_rate": 2e-05, "loss": 0.677, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8981, "tokens_per_second_per_gpu": 10648.68, "total_tokens": 886783461 }, { "epoch": 0.5615153788447111, "grad_norm": 0.9413611888885498, "learning_rate": 2e-05, "loss": 0.6474, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8982, "tokens_per_second_per_gpu": 10875.56, "total_tokens": 886885530 }, { "epoch": 0.5615778944736184, "grad_norm": 0.9130365252494812, "learning_rate": 2e-05, "loss": 0.6251, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8983, "tokens_per_second_per_gpu": 10835.23, "total_tokens": 886984981 }, { "epoch": 0.5616404101025256, "grad_norm": 0.9194187521934509, "learning_rate": 2e-05, "loss": 0.6306, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8984, "tokens_per_second_per_gpu": 10168.49, "total_tokens": 887082961 }, { "epoch": 0.5617029257314329, "grad_norm": 0.8953165411949158, "learning_rate": 2e-05, "loss": 0.6143, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8985, "tokens_per_second_per_gpu": 9645.31, "total_tokens": 887176785 }, { "epoch": 0.5617654413603401, "grad_norm": 0.9010658860206604, "learning_rate": 2e-05, "loss": 0.6282, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8986, "tokens_per_second_per_gpu": 11215.96, "total_tokens": 887280698 }, { "epoch": 0.5618279569892473, "grad_norm": 0.9183212518692017, "learning_rate": 2e-05, "loss": 0.6703, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8987, "tokens_per_second_per_gpu": 11224.21, "total_tokens": 887383955 }, { "epoch": 0.5618904726181545, "grad_norm": 0.8830273151397705, "learning_rate": 2e-05, "loss": 0.6522, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8988, "tokens_per_second_per_gpu": 11040.9, "total_tokens": 887489049 }, { "epoch": 0.5619529882470617, "grad_norm": 0.8697304725646973, "learning_rate": 2e-05, "loss": 0.6334, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8989, "tokens_per_second_per_gpu": 11469.35, "total_tokens": 887590259 }, { "epoch": 0.562015503875969, "grad_norm": 0.9218939542770386, "learning_rate": 2e-05, "loss": 0.6705, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8990, "tokens_per_second_per_gpu": 9906.3, "total_tokens": 887687623 }, { "epoch": 0.5620780195048762, "grad_norm": 0.9155179858207703, "learning_rate": 2e-05, "loss": 0.6138, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8991, "tokens_per_second_per_gpu": 10268.85, "total_tokens": 887787964 }, { "epoch": 0.5621405351337835, "grad_norm": 0.912986695766449, "learning_rate": 2e-05, "loss": 0.6235, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8992, "tokens_per_second_per_gpu": 9578.18, "total_tokens": 887882344 }, { "epoch": 0.5622030507626907, "grad_norm": 0.8857983350753784, "learning_rate": 2e-05, "loss": 0.6546, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8993, "tokens_per_second_per_gpu": 11069.34, "total_tokens": 887985472 }, { "epoch": 0.5622655663915979, "grad_norm": 0.9446619749069214, "learning_rate": 2e-05, "loss": 0.6547, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8994, "tokens_per_second_per_gpu": 11333.23, "total_tokens": 888084416 }, { "epoch": 0.5623280820205051, "grad_norm": 0.8939265012741089, "learning_rate": 2e-05, "loss": 0.6445, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8995, "tokens_per_second_per_gpu": 10765.46, "total_tokens": 888184202 }, { "epoch": 0.5623905976494123, "grad_norm": 0.9036610126495361, "learning_rate": 2e-05, "loss": 0.6368, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8996, "tokens_per_second_per_gpu": 10181.82, "total_tokens": 888283281 }, { "epoch": 0.5624531132783196, "grad_norm": 0.8955302834510803, "learning_rate": 2e-05, "loss": 0.6281, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8997, "tokens_per_second_per_gpu": 10432.26, "total_tokens": 888381542 }, { "epoch": 0.5625156289072268, "grad_norm": 0.890619158744812, "learning_rate": 2e-05, "loss": 0.6732, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8998, "tokens_per_second_per_gpu": 11157.21, "total_tokens": 888484973 }, { "epoch": 0.5625781445361341, "grad_norm": 0.9133090376853943, "learning_rate": 2e-05, "loss": 0.6729, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 8999, "tokens_per_second_per_gpu": 10988.89, "total_tokens": 888585749 }, { "epoch": 0.5626406601650412, "grad_norm": 0.9422465562820435, "learning_rate": 2e-05, "loss": 0.6383, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9000, "tokens_per_second_per_gpu": 10487.86, "total_tokens": 888680844 }, { "epoch": 0.5627031757939485, "grad_norm": 0.9137200117111206, "learning_rate": 2e-05, "loss": 0.6531, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9001, "tokens_per_second_per_gpu": 10737.87, "total_tokens": 888780868 }, { "epoch": 0.5627656914228557, "grad_norm": 0.9122173190116882, "learning_rate": 2e-05, "loss": 0.6334, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9002, "tokens_per_second_per_gpu": 8989.82, "total_tokens": 888872834 }, { "epoch": 0.562828207051763, "grad_norm": 0.88199782371521, "learning_rate": 2e-05, "loss": 0.627, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9003, "tokens_per_second_per_gpu": 11185.6, "total_tokens": 888975868 }, { "epoch": 0.5628907226806702, "grad_norm": 0.8972648978233337, "learning_rate": 2e-05, "loss": 0.612, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9004, "tokens_per_second_per_gpu": 9702.55, "total_tokens": 889073917 }, { "epoch": 0.5629532383095774, "grad_norm": 0.8724454045295715, "learning_rate": 2e-05, "loss": 0.6062, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9005, "tokens_per_second_per_gpu": 10633.38, "total_tokens": 889171781 }, { "epoch": 0.5630157539384846, "grad_norm": 0.8880794644355774, "learning_rate": 2e-05, "loss": 0.5788, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9006, "tokens_per_second_per_gpu": 9267.24, "total_tokens": 889266484 }, { "epoch": 0.5630782695673918, "grad_norm": 0.8920084238052368, "learning_rate": 2e-05, "loss": 0.6394, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9007, "tokens_per_second_per_gpu": 10070.46, "total_tokens": 889362806 }, { "epoch": 0.5631407851962991, "grad_norm": 0.8461345434188843, "learning_rate": 2e-05, "loss": 0.6276, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9008, "tokens_per_second_per_gpu": 10217.63, "total_tokens": 889464031 }, { "epoch": 0.5632033008252063, "grad_norm": 0.9006032347679138, "learning_rate": 2e-05, "loss": 0.6492, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9009, "tokens_per_second_per_gpu": 10291.96, "total_tokens": 889565998 }, { "epoch": 0.5632658164541136, "grad_norm": 0.8805161714553833, "learning_rate": 2e-05, "loss": 0.6467, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9010, "tokens_per_second_per_gpu": 10231.47, "total_tokens": 889663469 }, { "epoch": 0.5633283320830208, "grad_norm": 0.8612461686134338, "learning_rate": 2e-05, "loss": 0.633, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9011, "tokens_per_second_per_gpu": 10083.96, "total_tokens": 889760326 }, { "epoch": 0.5633908477119279, "grad_norm": 0.8599788546562195, "learning_rate": 2e-05, "loss": 0.6688, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9012, "tokens_per_second_per_gpu": 10769.69, "total_tokens": 889862157 }, { "epoch": 0.5634533633408352, "grad_norm": 0.8911135792732239, "learning_rate": 2e-05, "loss": 0.6341, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9013, "tokens_per_second_per_gpu": 10702.98, "total_tokens": 889963757 }, { "epoch": 0.5635158789697424, "grad_norm": 0.8627402782440186, "learning_rate": 2e-05, "loss": 0.6439, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9014, "tokens_per_second_per_gpu": 10189.55, "total_tokens": 890064709 }, { "epoch": 0.5635783945986497, "grad_norm": 0.8943480849266052, "learning_rate": 2e-05, "loss": 0.609, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9015, "tokens_per_second_per_gpu": 9748.76, "total_tokens": 890159599 }, { "epoch": 0.5636409102275569, "grad_norm": 0.8971701860427856, "learning_rate": 2e-05, "loss": 0.6272, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9016, "tokens_per_second_per_gpu": 11196.74, "total_tokens": 890261435 }, { "epoch": 0.5637034258564642, "grad_norm": 0.8836543560028076, "learning_rate": 2e-05, "loss": 0.6832, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9017, "tokens_per_second_per_gpu": 11397.08, "total_tokens": 890363642 }, { "epoch": 0.5637659414853713, "grad_norm": 0.9254997372627258, "learning_rate": 2e-05, "loss": 0.661, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9018, "tokens_per_second_per_gpu": 10443.27, "total_tokens": 890459217 }, { "epoch": 0.5638284571142785, "grad_norm": 0.8569828271865845, "learning_rate": 2e-05, "loss": 0.6302, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9019, "tokens_per_second_per_gpu": 10809.17, "total_tokens": 890563809 }, { "epoch": 0.5638909727431858, "grad_norm": 0.8890528678894043, "learning_rate": 2e-05, "loss": 0.6488, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9020, "tokens_per_second_per_gpu": 10113.88, "total_tokens": 890662678 }, { "epoch": 0.563953488372093, "grad_norm": 0.8940821290016174, "learning_rate": 2e-05, "loss": 0.6132, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9021, "tokens_per_second_per_gpu": 10780.87, "total_tokens": 890760439 }, { "epoch": 0.5640160040010003, "grad_norm": 0.8724679350852966, "learning_rate": 2e-05, "loss": 0.6333, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9022, "tokens_per_second_per_gpu": 10969.07, "total_tokens": 890860648 }, { "epoch": 0.5640785196299075, "grad_norm": 0.8975785374641418, "learning_rate": 2e-05, "loss": 0.6124, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9023, "tokens_per_second_per_gpu": 10370.16, "total_tokens": 890961109 }, { "epoch": 0.5641410352588146, "grad_norm": 0.8639355301856995, "learning_rate": 2e-05, "loss": 0.6261, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9024, "tokens_per_second_per_gpu": 10933.41, "total_tokens": 891062989 }, { "epoch": 0.5642035508877219, "grad_norm": 0.8525106906890869, "learning_rate": 2e-05, "loss": 0.6135, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9025, "tokens_per_second_per_gpu": 11370.02, "total_tokens": 891164225 }, { "epoch": 0.5642660665166291, "grad_norm": 0.8792645335197449, "learning_rate": 2e-05, "loss": 0.6278, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9026, "tokens_per_second_per_gpu": 10663.65, "total_tokens": 891264841 }, { "epoch": 0.5643285821455364, "grad_norm": 0.8678024411201477, "learning_rate": 2e-05, "loss": 0.6574, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9027, "tokens_per_second_per_gpu": 10342.51, "total_tokens": 891368528 }, { "epoch": 0.5643910977744436, "grad_norm": 0.8620607852935791, "learning_rate": 2e-05, "loss": 0.6302, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9028, "tokens_per_second_per_gpu": 10087.39, "total_tokens": 891466086 }, { "epoch": 0.5644536134033509, "grad_norm": 0.9012091755867004, "learning_rate": 2e-05, "loss": 0.6321, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9029, "tokens_per_second_per_gpu": 10205.5, "total_tokens": 891567158 }, { "epoch": 0.5645161290322581, "grad_norm": 0.9059594869613647, "learning_rate": 2e-05, "loss": 0.6199, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9030, "tokens_per_second_per_gpu": 10546.2, "total_tokens": 891665650 }, { "epoch": 0.5645786446611653, "grad_norm": 0.9471845626831055, "learning_rate": 2e-05, "loss": 0.6435, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9031, "tokens_per_second_per_gpu": 10813.17, "total_tokens": 891764723 }, { "epoch": 0.5646411602900725, "grad_norm": 0.898188054561615, "learning_rate": 2e-05, "loss": 0.6736, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9032, "tokens_per_second_per_gpu": 10920.5, "total_tokens": 891863625 }, { "epoch": 0.5647036759189797, "grad_norm": 0.9220165610313416, "learning_rate": 2e-05, "loss": 0.6108, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9033, "tokens_per_second_per_gpu": 10189.54, "total_tokens": 891957776 }, { "epoch": 0.564766191547887, "grad_norm": 0.9127065539360046, "learning_rate": 2e-05, "loss": 0.6659, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9034, "tokens_per_second_per_gpu": 10220.62, "total_tokens": 892057043 }, { "epoch": 0.5648287071767942, "grad_norm": 0.8480685353279114, "learning_rate": 2e-05, "loss": 0.6168, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9035, "tokens_per_second_per_gpu": 15144.71, "total_tokens": 892159903 }, { "epoch": 0.5648912228057015, "grad_norm": 0.8964866995811462, "learning_rate": 2e-05, "loss": 0.6366, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9036, "tokens_per_second_per_gpu": 10431.8, "total_tokens": 892258558 }, { "epoch": 0.5649537384346086, "grad_norm": 0.8766170740127563, "learning_rate": 2e-05, "loss": 0.6465, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9037, "tokens_per_second_per_gpu": 10354.43, "total_tokens": 892356642 }, { "epoch": 0.5650162540635159, "grad_norm": 0.8963708281517029, "learning_rate": 2e-05, "loss": 0.6199, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9038, "tokens_per_second_per_gpu": 10390.07, "total_tokens": 892457036 }, { "epoch": 0.5650787696924231, "grad_norm": 0.8538951873779297, "learning_rate": 2e-05, "loss": 0.6284, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9039, "tokens_per_second_per_gpu": 10599.44, "total_tokens": 892559223 }, { "epoch": 0.5651412853213303, "grad_norm": 1.0517851114273071, "learning_rate": 2e-05, "loss": 0.6023, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9040, "tokens_per_second_per_gpu": 10653.38, "total_tokens": 892653485 }, { "epoch": 0.5652038009502376, "grad_norm": 0.8520570993423462, "learning_rate": 2e-05, "loss": 0.601, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9041, "tokens_per_second_per_gpu": 10404.51, "total_tokens": 892750983 }, { "epoch": 0.5652663165791448, "grad_norm": 0.9667475819587708, "learning_rate": 2e-05, "loss": 0.6937, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9042, "tokens_per_second_per_gpu": 9468.92, "total_tokens": 892850707 }, { "epoch": 0.565328832208052, "grad_norm": 0.8658114075660706, "learning_rate": 2e-05, "loss": 0.6549, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9043, "tokens_per_second_per_gpu": 10859.15, "total_tokens": 892953879 }, { "epoch": 0.5653913478369592, "grad_norm": 0.9553476572036743, "learning_rate": 2e-05, "loss": 0.621, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9044, "tokens_per_second_per_gpu": 10930.77, "total_tokens": 893058025 }, { "epoch": 0.5654538634658665, "grad_norm": 0.8927204608917236, "learning_rate": 2e-05, "loss": 0.7041, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9045, "tokens_per_second_per_gpu": 10672.15, "total_tokens": 893161826 }, { "epoch": 0.5655163790947737, "grad_norm": 0.8981958031654358, "learning_rate": 2e-05, "loss": 0.6609, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9046, "tokens_per_second_per_gpu": 10630.97, "total_tokens": 893260510 }, { "epoch": 0.565578894723681, "grad_norm": 0.8781256079673767, "learning_rate": 2e-05, "loss": 0.6344, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9047, "tokens_per_second_per_gpu": 10231.67, "total_tokens": 893358158 }, { "epoch": 0.5656414103525882, "grad_norm": 0.8755016326904297, "learning_rate": 2e-05, "loss": 0.5759, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9048, "tokens_per_second_per_gpu": 9623.2, "total_tokens": 893454240 }, { "epoch": 0.5657039259814953, "grad_norm": 0.9124635457992554, "learning_rate": 2e-05, "loss": 0.6463, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9049, "tokens_per_second_per_gpu": 10506.36, "total_tokens": 893553055 }, { "epoch": 0.5657664416104026, "grad_norm": 0.9189612865447998, "learning_rate": 2e-05, "loss": 0.691, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9050, "tokens_per_second_per_gpu": 10883.68, "total_tokens": 893658651 }, { "epoch": 0.5658289572393098, "grad_norm": 0.8508475422859192, "learning_rate": 2e-05, "loss": 0.6266, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9051, "tokens_per_second_per_gpu": 10773.78, "total_tokens": 893763156 }, { "epoch": 0.5658914728682171, "grad_norm": 0.8879354596138, "learning_rate": 2e-05, "loss": 0.6237, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9052, "tokens_per_second_per_gpu": 11089.18, "total_tokens": 893859860 }, { "epoch": 0.5659539884971243, "grad_norm": 0.9131381511688232, "learning_rate": 2e-05, "loss": 0.6852, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9053, "tokens_per_second_per_gpu": 10832.14, "total_tokens": 893962255 }, { "epoch": 0.5660165041260315, "grad_norm": 0.90041583776474, "learning_rate": 2e-05, "loss": 0.6583, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9054, "tokens_per_second_per_gpu": 10427.93, "total_tokens": 894065066 }, { "epoch": 0.5660790197549387, "grad_norm": 0.8708183765411377, "learning_rate": 2e-05, "loss": 0.6121, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9055, "tokens_per_second_per_gpu": 10856.21, "total_tokens": 894165893 }, { "epoch": 0.5661415353838459, "grad_norm": 0.8888447880744934, "learning_rate": 2e-05, "loss": 0.618, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9056, "tokens_per_second_per_gpu": 10470.89, "total_tokens": 894268546 }, { "epoch": 0.5662040510127532, "grad_norm": 0.9122999310493469, "learning_rate": 2e-05, "loss": 0.6181, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9057, "tokens_per_second_per_gpu": 10146.02, "total_tokens": 894366938 }, { "epoch": 0.5662665666416604, "grad_norm": 0.9127549529075623, "learning_rate": 2e-05, "loss": 0.6161, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9058, "tokens_per_second_per_gpu": 10004.87, "total_tokens": 894461243 }, { "epoch": 0.5663290822705677, "grad_norm": 0.9129006266593933, "learning_rate": 2e-05, "loss": 0.6529, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9059, "tokens_per_second_per_gpu": 11052.12, "total_tokens": 894562367 }, { "epoch": 0.5663915978994749, "grad_norm": 0.8797143697738647, "learning_rate": 2e-05, "loss": 0.598, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9060, "tokens_per_second_per_gpu": 10172.92, "total_tokens": 894657539 }, { "epoch": 0.566454113528382, "grad_norm": 0.878304123878479, "learning_rate": 2e-05, "loss": 0.5996, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9061, "tokens_per_second_per_gpu": 10140.54, "total_tokens": 894757908 }, { "epoch": 0.5665166291572893, "grad_norm": 0.9264856576919556, "learning_rate": 2e-05, "loss": 0.631, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9062, "tokens_per_second_per_gpu": 10482.65, "total_tokens": 894857171 }, { "epoch": 0.5665791447861965, "grad_norm": 0.9284349679946899, "learning_rate": 2e-05, "loss": 0.6304, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9063, "tokens_per_second_per_gpu": 10663.44, "total_tokens": 894954155 }, { "epoch": 0.5666416604151038, "grad_norm": 0.9187344908714294, "learning_rate": 2e-05, "loss": 0.6386, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9064, "tokens_per_second_per_gpu": 10655.32, "total_tokens": 895053426 }, { "epoch": 0.566704176044011, "grad_norm": 0.8465887904167175, "learning_rate": 2e-05, "loss": 0.5886, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9065, "tokens_per_second_per_gpu": 9831.34, "total_tokens": 895150895 }, { "epoch": 0.5667666916729183, "grad_norm": 0.8932048678398132, "learning_rate": 2e-05, "loss": 0.6879, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9066, "tokens_per_second_per_gpu": 11002.23, "total_tokens": 895253341 }, { "epoch": 0.5668292073018255, "grad_norm": 0.9561148285865784, "learning_rate": 2e-05, "loss": 0.6456, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9067, "tokens_per_second_per_gpu": 10844.3, "total_tokens": 895350886 }, { "epoch": 0.5668917229307326, "grad_norm": 0.8660605549812317, "learning_rate": 2e-05, "loss": 0.6577, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9068, "tokens_per_second_per_gpu": 10275.19, "total_tokens": 895450195 }, { "epoch": 0.5669542385596399, "grad_norm": 0.9397193789482117, "learning_rate": 2e-05, "loss": 0.647, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9069, "tokens_per_second_per_gpu": 10798.28, "total_tokens": 895548620 }, { "epoch": 0.5670167541885471, "grad_norm": 0.8947558999061584, "learning_rate": 2e-05, "loss": 0.6367, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9070, "tokens_per_second_per_gpu": 10374.31, "total_tokens": 895650502 }, { "epoch": 0.5670792698174544, "grad_norm": 0.9347788691520691, "learning_rate": 2e-05, "loss": 0.6221, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9071, "tokens_per_second_per_gpu": 10575.92, "total_tokens": 895751634 }, { "epoch": 0.5671417854463616, "grad_norm": 0.9260942339897156, "learning_rate": 2e-05, "loss": 0.6094, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9072, "tokens_per_second_per_gpu": 10724.94, "total_tokens": 895850303 }, { "epoch": 0.5672043010752689, "grad_norm": 0.8922328948974609, "learning_rate": 2e-05, "loss": 0.6379, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9073, "tokens_per_second_per_gpu": 11309.94, "total_tokens": 895953100 }, { "epoch": 0.567266816704176, "grad_norm": 0.8409792184829712, "learning_rate": 2e-05, "loss": 0.6357, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9074, "tokens_per_second_per_gpu": 10728.98, "total_tokens": 896056065 }, { "epoch": 0.5673293323330832, "grad_norm": 0.8558197021484375, "learning_rate": 2e-05, "loss": 0.6246, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9075, "tokens_per_second_per_gpu": 11176.53, "total_tokens": 896154991 }, { "epoch": 0.5673918479619905, "grad_norm": 0.8983711004257202, "learning_rate": 2e-05, "loss": 0.6346, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9076, "tokens_per_second_per_gpu": 10195.05, "total_tokens": 896251886 }, { "epoch": 0.5674543635908977, "grad_norm": 0.8588225841522217, "learning_rate": 2e-05, "loss": 0.6227, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9077, "tokens_per_second_per_gpu": 10178.28, "total_tokens": 896351776 }, { "epoch": 0.567516879219805, "grad_norm": 0.8845319747924805, "learning_rate": 2e-05, "loss": 0.6645, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9078, "tokens_per_second_per_gpu": 11030.53, "total_tokens": 896455459 }, { "epoch": 0.5675793948487122, "grad_norm": 0.9117751717567444, "learning_rate": 2e-05, "loss": 0.6335, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9079, "tokens_per_second_per_gpu": 10549.96, "total_tokens": 896555122 }, { "epoch": 0.5676419104776194, "grad_norm": 0.884696900844574, "learning_rate": 2e-05, "loss": 0.6002, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9080, "tokens_per_second_per_gpu": 10644.01, "total_tokens": 896655920 }, { "epoch": 0.5677044261065266, "grad_norm": 0.8876227140426636, "learning_rate": 2e-05, "loss": 0.6569, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9081, "tokens_per_second_per_gpu": 10780.6, "total_tokens": 896753399 }, { "epoch": 0.5677669417354338, "grad_norm": 0.9047763347625732, "learning_rate": 2e-05, "loss": 0.6152, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9082, "tokens_per_second_per_gpu": 10422.89, "total_tokens": 896848409 }, { "epoch": 0.5678294573643411, "grad_norm": 0.900680661201477, "learning_rate": 2e-05, "loss": 0.562, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9083, "tokens_per_second_per_gpu": 10329.86, "total_tokens": 896945039 }, { "epoch": 0.5678919729932483, "grad_norm": 0.8742714524269104, "learning_rate": 2e-05, "loss": 0.6054, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9084, "tokens_per_second_per_gpu": 10433.74, "total_tokens": 897041930 }, { "epoch": 0.5679544886221556, "grad_norm": 0.9401944279670715, "learning_rate": 2e-05, "loss": 0.6188, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9085, "tokens_per_second_per_gpu": 10160.03, "total_tokens": 897135930 }, { "epoch": 0.5680170042510627, "grad_norm": 0.8814311027526855, "learning_rate": 2e-05, "loss": 0.6665, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9086, "tokens_per_second_per_gpu": 10079.1, "total_tokens": 897235282 }, { "epoch": 0.56807951987997, "grad_norm": 0.8775532841682434, "learning_rate": 2e-05, "loss": 0.5884, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9087, "tokens_per_second_per_gpu": 10046.91, "total_tokens": 897330770 }, { "epoch": 0.5681420355088772, "grad_norm": 0.9045752882957458, "learning_rate": 2e-05, "loss": 0.6096, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9088, "tokens_per_second_per_gpu": 9918.63, "total_tokens": 897428900 }, { "epoch": 0.5682045511377845, "grad_norm": 0.8549927473068237, "learning_rate": 2e-05, "loss": 0.5973, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9089, "tokens_per_second_per_gpu": 9935.49, "total_tokens": 897528074 }, { "epoch": 0.5682670667666917, "grad_norm": 0.864490807056427, "learning_rate": 2e-05, "loss": 0.6121, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9090, "tokens_per_second_per_gpu": 10761.47, "total_tokens": 897626303 }, { "epoch": 0.5683295823955989, "grad_norm": 0.8619561195373535, "learning_rate": 2e-05, "loss": 0.6228, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9091, "tokens_per_second_per_gpu": 11037.28, "total_tokens": 897727797 }, { "epoch": 0.5683920980245061, "grad_norm": 0.9188150763511658, "learning_rate": 2e-05, "loss": 0.6304, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9092, "tokens_per_second_per_gpu": 10660.73, "total_tokens": 897825398 }, { "epoch": 0.5684546136534133, "grad_norm": 0.8719335794448853, "learning_rate": 2e-05, "loss": 0.6819, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9093, "tokens_per_second_per_gpu": 10948.7, "total_tokens": 897926087 }, { "epoch": 0.5685171292823206, "grad_norm": 0.8820760846138, "learning_rate": 2e-05, "loss": 0.6391, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9094, "tokens_per_second_per_gpu": 10051.9, "total_tokens": 898022788 }, { "epoch": 0.5685796449112278, "grad_norm": 0.8762052059173584, "learning_rate": 2e-05, "loss": 0.6442, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9095, "tokens_per_second_per_gpu": 10785.49, "total_tokens": 898122364 }, { "epoch": 0.568642160540135, "grad_norm": 0.857185959815979, "learning_rate": 2e-05, "loss": 0.6444, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9096, "tokens_per_second_per_gpu": 11591.03, "total_tokens": 898227239 }, { "epoch": 0.5687046761690423, "grad_norm": 0.9084821343421936, "learning_rate": 2e-05, "loss": 0.6272, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9097, "tokens_per_second_per_gpu": 10056.96, "total_tokens": 898326435 }, { "epoch": 0.5687671917979494, "grad_norm": 0.8674421310424805, "learning_rate": 2e-05, "loss": 0.5753, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9098, "tokens_per_second_per_gpu": 9649.73, "total_tokens": 898416683 }, { "epoch": 0.5688297074268567, "grad_norm": 0.8667445182800293, "learning_rate": 2e-05, "loss": 0.6324, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9099, "tokens_per_second_per_gpu": 10637.08, "total_tokens": 898517317 }, { "epoch": 0.5688922230557639, "grad_norm": 0.9103919267654419, "learning_rate": 2e-05, "loss": 0.6544, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9100, "tokens_per_second_per_gpu": 10433.38, "total_tokens": 898615698 }, { "epoch": 0.5689547386846712, "grad_norm": 0.8917847275733948, "learning_rate": 2e-05, "loss": 0.6527, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9101, "tokens_per_second_per_gpu": 10431.12, "total_tokens": 898715000 }, { "epoch": 0.5690172543135784, "grad_norm": 0.8873295783996582, "learning_rate": 2e-05, "loss": 0.6386, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9102, "tokens_per_second_per_gpu": 10584.4, "total_tokens": 898816554 }, { "epoch": 0.5690797699424857, "grad_norm": 0.8959203362464905, "learning_rate": 2e-05, "loss": 0.6261, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9103, "tokens_per_second_per_gpu": 10493.87, "total_tokens": 898912729 }, { "epoch": 0.5691422855713929, "grad_norm": 0.8440837264060974, "learning_rate": 2e-05, "loss": 0.6422, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9104, "tokens_per_second_per_gpu": 11264.03, "total_tokens": 899017806 }, { "epoch": 0.5692048012003, "grad_norm": 0.9093919992446899, "learning_rate": 2e-05, "loss": 0.6581, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9105, "tokens_per_second_per_gpu": 10953.23, "total_tokens": 899117183 }, { "epoch": 0.5692673168292073, "grad_norm": 0.8852201700210571, "learning_rate": 2e-05, "loss": 0.6579, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9106, "tokens_per_second_per_gpu": 10809.69, "total_tokens": 899217763 }, { "epoch": 0.5693298324581145, "grad_norm": 0.8751976490020752, "learning_rate": 2e-05, "loss": 0.6281, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9107, "tokens_per_second_per_gpu": 10030.75, "total_tokens": 899316038 }, { "epoch": 0.5693923480870218, "grad_norm": 0.8849359154701233, "learning_rate": 2e-05, "loss": 0.6313, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9108, "tokens_per_second_per_gpu": 10375.14, "total_tokens": 899413628 }, { "epoch": 0.569454863715929, "grad_norm": 0.9132274985313416, "learning_rate": 2e-05, "loss": 0.6557, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9109, "tokens_per_second_per_gpu": 10420.0, "total_tokens": 899510201 }, { "epoch": 0.5695173793448363, "grad_norm": 0.907465934753418, "learning_rate": 2e-05, "loss": 0.5912, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9110, "tokens_per_second_per_gpu": 9747.74, "total_tokens": 899606192 }, { "epoch": 0.5695798949737434, "grad_norm": 0.8652750253677368, "learning_rate": 2e-05, "loss": 0.6294, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9111, "tokens_per_second_per_gpu": 10110.64, "total_tokens": 899703756 }, { "epoch": 0.5696424106026506, "grad_norm": 0.8557974100112915, "learning_rate": 2e-05, "loss": 0.6144, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9112, "tokens_per_second_per_gpu": 10438.75, "total_tokens": 899799480 }, { "epoch": 0.5697049262315579, "grad_norm": 0.8943153619766235, "learning_rate": 2e-05, "loss": 0.6387, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9113, "tokens_per_second_per_gpu": 10777.2, "total_tokens": 899898970 }, { "epoch": 0.5697674418604651, "grad_norm": 0.8933148980140686, "learning_rate": 2e-05, "loss": 0.6172, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9114, "tokens_per_second_per_gpu": 10173.21, "total_tokens": 899998103 }, { "epoch": 0.5698299574893724, "grad_norm": 0.9043829441070557, "learning_rate": 2e-05, "loss": 0.6396, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9115, "tokens_per_second_per_gpu": 10611.14, "total_tokens": 900100165 }, { "epoch": 0.5698924731182796, "grad_norm": 0.8871011137962341, "learning_rate": 2e-05, "loss": 0.6076, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9116, "tokens_per_second_per_gpu": 10842.43, "total_tokens": 900198791 }, { "epoch": 0.5699549887471868, "grad_norm": 0.883100152015686, "learning_rate": 2e-05, "loss": 0.636, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9117, "tokens_per_second_per_gpu": 11416.12, "total_tokens": 900297686 }, { "epoch": 0.570017504376094, "grad_norm": 0.9124422073364258, "learning_rate": 2e-05, "loss": 0.6361, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9118, "tokens_per_second_per_gpu": 10406.31, "total_tokens": 900396239 }, { "epoch": 0.5700800200050012, "grad_norm": 0.9308980703353882, "learning_rate": 2e-05, "loss": 0.61, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9119, "tokens_per_second_per_gpu": 10307.93, "total_tokens": 900492400 }, { "epoch": 0.5701425356339085, "grad_norm": 0.8534010052680969, "learning_rate": 2e-05, "loss": 0.6268, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9120, "tokens_per_second_per_gpu": 10430.41, "total_tokens": 900594041 }, { "epoch": 0.5702050512628157, "grad_norm": 0.8793431520462036, "learning_rate": 2e-05, "loss": 0.6098, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9121, "tokens_per_second_per_gpu": 10763.75, "total_tokens": 900694295 }, { "epoch": 0.570267566891723, "grad_norm": 0.860775351524353, "learning_rate": 2e-05, "loss": 0.6028, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9122, "tokens_per_second_per_gpu": 10407.5, "total_tokens": 900791640 }, { "epoch": 0.5703300825206301, "grad_norm": 0.9209607839584351, "learning_rate": 2e-05, "loss": 0.6389, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9123, "tokens_per_second_per_gpu": 10450.46, "total_tokens": 900892798 }, { "epoch": 0.5703925981495374, "grad_norm": 0.900334894657135, "learning_rate": 2e-05, "loss": 0.637, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9124, "tokens_per_second_per_gpu": 10536.27, "total_tokens": 900995217 }, { "epoch": 0.5704551137784446, "grad_norm": 0.8890225887298584, "learning_rate": 2e-05, "loss": 0.6267, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9125, "tokens_per_second_per_gpu": 11037.19, "total_tokens": 901095389 }, { "epoch": 0.5705176294073518, "grad_norm": 0.8706218600273132, "learning_rate": 2e-05, "loss": 0.6047, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9126, "tokens_per_second_per_gpu": 10529.08, "total_tokens": 901194246 }, { "epoch": 0.5705801450362591, "grad_norm": 0.8901675939559937, "learning_rate": 2e-05, "loss": 0.6469, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9127, "tokens_per_second_per_gpu": 10709.54, "total_tokens": 901294864 }, { "epoch": 0.5706426606651663, "grad_norm": 0.8959726691246033, "learning_rate": 2e-05, "loss": 0.6573, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9128, "tokens_per_second_per_gpu": 10274.49, "total_tokens": 901389867 }, { "epoch": 0.5707051762940735, "grad_norm": 0.8964621424674988, "learning_rate": 2e-05, "loss": 0.6301, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9129, "tokens_per_second_per_gpu": 10518.01, "total_tokens": 901489633 }, { "epoch": 0.5707676919229807, "grad_norm": 0.8683385252952576, "learning_rate": 2e-05, "loss": 0.6266, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9130, "tokens_per_second_per_gpu": 10520.91, "total_tokens": 901592258 }, { "epoch": 0.570830207551888, "grad_norm": 1.955098271369934, "learning_rate": 2e-05, "loss": 0.6171, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9131, "tokens_per_second_per_gpu": 9936.15, "total_tokens": 901690321 }, { "epoch": 0.5708927231807952, "grad_norm": 0.9237292408943176, "learning_rate": 2e-05, "loss": 0.6544, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9132, "tokens_per_second_per_gpu": 10405.41, "total_tokens": 901791660 }, { "epoch": 0.5709552388097024, "grad_norm": 0.8830103278160095, "learning_rate": 2e-05, "loss": 0.6351, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9133, "tokens_per_second_per_gpu": 11202.69, "total_tokens": 901892315 }, { "epoch": 0.5710177544386097, "grad_norm": 0.8871188759803772, "learning_rate": 2e-05, "loss": 0.6289, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9134, "tokens_per_second_per_gpu": 10527.49, "total_tokens": 901991244 }, { "epoch": 0.5710802700675168, "grad_norm": 0.923163115978241, "learning_rate": 2e-05, "loss": 0.6147, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9135, "tokens_per_second_per_gpu": 10101.49, "total_tokens": 902088291 }, { "epoch": 0.5711427856964241, "grad_norm": 0.87306147813797, "learning_rate": 2e-05, "loss": 0.6437, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9136, "tokens_per_second_per_gpu": 11003.31, "total_tokens": 902187870 }, { "epoch": 0.5712053013253313, "grad_norm": 0.8574735522270203, "learning_rate": 2e-05, "loss": 0.6555, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9137, "tokens_per_second_per_gpu": 9887.55, "total_tokens": 902287596 }, { "epoch": 0.5712678169542386, "grad_norm": 0.9344844818115234, "learning_rate": 2e-05, "loss": 0.637, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9138, "tokens_per_second_per_gpu": 10282.13, "total_tokens": 902385879 }, { "epoch": 0.5713303325831458, "grad_norm": 0.8965919017791748, "learning_rate": 2e-05, "loss": 0.6268, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9139, "tokens_per_second_per_gpu": 10884.36, "total_tokens": 902485895 }, { "epoch": 0.571392848212053, "grad_norm": 0.8864029049873352, "learning_rate": 2e-05, "loss": 0.608, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9140, "tokens_per_second_per_gpu": 10726.01, "total_tokens": 902580332 }, { "epoch": 0.5714553638409603, "grad_norm": 0.900540828704834, "learning_rate": 2e-05, "loss": 0.6842, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9141, "tokens_per_second_per_gpu": 10508.0, "total_tokens": 902678708 }, { "epoch": 0.5715178794698674, "grad_norm": 0.8904759883880615, "learning_rate": 2e-05, "loss": 0.6609, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9142, "tokens_per_second_per_gpu": 10740.37, "total_tokens": 902779247 }, { "epoch": 0.5715803950987747, "grad_norm": 0.8841188549995422, "learning_rate": 2e-05, "loss": 0.6455, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9143, "tokens_per_second_per_gpu": 10597.87, "total_tokens": 902878366 }, { "epoch": 0.5716429107276819, "grad_norm": 0.9167156219482422, "learning_rate": 2e-05, "loss": 0.6729, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9144, "tokens_per_second_per_gpu": 11131.02, "total_tokens": 902977188 }, { "epoch": 0.5717054263565892, "grad_norm": 0.9287437796592712, "learning_rate": 2e-05, "loss": 0.6208, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9145, "tokens_per_second_per_gpu": 10101.11, "total_tokens": 903074530 }, { "epoch": 0.5717679419854964, "grad_norm": 0.9020741581916809, "learning_rate": 2e-05, "loss": 0.6715, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9146, "tokens_per_second_per_gpu": 11368.28, "total_tokens": 903177655 }, { "epoch": 0.5718304576144037, "grad_norm": 0.8834019899368286, "learning_rate": 2e-05, "loss": 0.6516, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9147, "tokens_per_second_per_gpu": 10638.08, "total_tokens": 903279379 }, { "epoch": 0.5718929732433108, "grad_norm": 0.9030375480651855, "learning_rate": 2e-05, "loss": 0.5903, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9148, "tokens_per_second_per_gpu": 9878.47, "total_tokens": 903370957 }, { "epoch": 0.571955488872218, "grad_norm": 0.8887526392936707, "learning_rate": 2e-05, "loss": 0.6588, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9149, "tokens_per_second_per_gpu": 10569.82, "total_tokens": 903470439 }, { "epoch": 0.5720180045011253, "grad_norm": 0.897548258304596, "learning_rate": 2e-05, "loss": 0.6457, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9150, "tokens_per_second_per_gpu": 10503.34, "total_tokens": 903564566 }, { "epoch": 0.5720805201300325, "grad_norm": 0.9131994843482971, "learning_rate": 2e-05, "loss": 0.6828, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9151, "tokens_per_second_per_gpu": 11348.19, "total_tokens": 903666175 }, { "epoch": 0.5721430357589398, "grad_norm": 0.9011214375495911, "learning_rate": 2e-05, "loss": 0.6357, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9152, "tokens_per_second_per_gpu": 10437.82, "total_tokens": 903765771 }, { "epoch": 0.572205551387847, "grad_norm": 0.8864151835441589, "learning_rate": 2e-05, "loss": 0.6416, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9153, "tokens_per_second_per_gpu": 10258.69, "total_tokens": 903865713 }, { "epoch": 0.5722680670167541, "grad_norm": 0.9597684741020203, "learning_rate": 2e-05, "loss": 0.606, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9154, "tokens_per_second_per_gpu": 9901.3, "total_tokens": 903957279 }, { "epoch": 0.5723305826456614, "grad_norm": 0.8327551484107971, "learning_rate": 2e-05, "loss": 0.5816, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9155, "tokens_per_second_per_gpu": 10088.09, "total_tokens": 904054897 }, { "epoch": 0.5723930982745686, "grad_norm": 0.8813408017158508, "learning_rate": 2e-05, "loss": 0.6437, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9156, "tokens_per_second_per_gpu": 9535.88, "total_tokens": 904151068 }, { "epoch": 0.5724556139034759, "grad_norm": 0.9072434306144714, "learning_rate": 2e-05, "loss": 0.6662, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9157, "tokens_per_second_per_gpu": 10444.67, "total_tokens": 904251883 }, { "epoch": 0.5725181295323831, "grad_norm": 0.8838159441947937, "learning_rate": 2e-05, "loss": 0.6527, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9158, "tokens_per_second_per_gpu": 10815.17, "total_tokens": 904348589 }, { "epoch": 0.5725806451612904, "grad_norm": 0.9197668433189392, "learning_rate": 2e-05, "loss": 0.6419, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9159, "tokens_per_second_per_gpu": 10082.1, "total_tokens": 904444121 }, { "epoch": 0.5726431607901975, "grad_norm": 0.8963351249694824, "learning_rate": 2e-05, "loss": 0.6439, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9160, "tokens_per_second_per_gpu": 10856.47, "total_tokens": 904542418 }, { "epoch": 0.5727056764191047, "grad_norm": 0.8868871927261353, "learning_rate": 2e-05, "loss": 0.6331, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9161, "tokens_per_second_per_gpu": 10137.47, "total_tokens": 904639117 }, { "epoch": 0.572768192048012, "grad_norm": 0.8888401389122009, "learning_rate": 2e-05, "loss": 0.6233, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9162, "tokens_per_second_per_gpu": 10693.23, "total_tokens": 904740802 }, { "epoch": 0.5728307076769192, "grad_norm": 0.8561129570007324, "learning_rate": 2e-05, "loss": 0.5884, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9163, "tokens_per_second_per_gpu": 10614.06, "total_tokens": 904841628 }, { "epoch": 0.5728932233058265, "grad_norm": 0.8762792348861694, "learning_rate": 2e-05, "loss": 0.6321, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9164, "tokens_per_second_per_gpu": 10095.43, "total_tokens": 904938314 }, { "epoch": 0.5729557389347337, "grad_norm": 0.9297592043876648, "learning_rate": 2e-05, "loss": 0.6989, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9165, "tokens_per_second_per_gpu": 10525.7, "total_tokens": 905037669 }, { "epoch": 0.5730182545636409, "grad_norm": 0.8967973589897156, "learning_rate": 2e-05, "loss": 0.6233, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9166, "tokens_per_second_per_gpu": 10677.71, "total_tokens": 905136387 }, { "epoch": 0.5730807701925481, "grad_norm": 0.9677903056144714, "learning_rate": 2e-05, "loss": 0.6214, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9167, "tokens_per_second_per_gpu": 10248.88, "total_tokens": 905228900 }, { "epoch": 0.5731432858214554, "grad_norm": 0.9221057295799255, "learning_rate": 2e-05, "loss": 0.6738, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9168, "tokens_per_second_per_gpu": 10457.46, "total_tokens": 905326162 }, { "epoch": 0.5732058014503626, "grad_norm": 0.877747118473053, "learning_rate": 2e-05, "loss": 0.6386, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9169, "tokens_per_second_per_gpu": 10405.38, "total_tokens": 905428136 }, { "epoch": 0.5732683170792698, "grad_norm": 0.9328575134277344, "learning_rate": 2e-05, "loss": 0.6896, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9170, "tokens_per_second_per_gpu": 11465.21, "total_tokens": 905529291 }, { "epoch": 0.5733308327081771, "grad_norm": 0.8842630982398987, "learning_rate": 2e-05, "loss": 0.6217, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9171, "tokens_per_second_per_gpu": 9973.08, "total_tokens": 905626993 }, { "epoch": 0.5733933483370842, "grad_norm": 0.8818732500076294, "learning_rate": 2e-05, "loss": 0.6265, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9172, "tokens_per_second_per_gpu": 9990.81, "total_tokens": 905723200 }, { "epoch": 0.5734558639659915, "grad_norm": 0.8878241181373596, "learning_rate": 2e-05, "loss": 0.6293, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9173, "tokens_per_second_per_gpu": 10506.71, "total_tokens": 905820779 }, { "epoch": 0.5735183795948987, "grad_norm": 0.8940924406051636, "learning_rate": 2e-05, "loss": 0.6904, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9174, "tokens_per_second_per_gpu": 10748.93, "total_tokens": 905922135 }, { "epoch": 0.573580895223806, "grad_norm": 0.8812363743782043, "learning_rate": 2e-05, "loss": 0.619, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9175, "tokens_per_second_per_gpu": 9832.89, "total_tokens": 906018332 }, { "epoch": 0.5736434108527132, "grad_norm": 0.9548799991607666, "learning_rate": 2e-05, "loss": 0.6678, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9176, "tokens_per_second_per_gpu": 10635.83, "total_tokens": 906118364 }, { "epoch": 0.5737059264816204, "grad_norm": 0.9356949925422668, "learning_rate": 2e-05, "loss": 0.6645, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9177, "tokens_per_second_per_gpu": 11073.69, "total_tokens": 906216611 }, { "epoch": 0.5737684421105276, "grad_norm": 0.9259135127067566, "learning_rate": 2e-05, "loss": 0.6633, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9178, "tokens_per_second_per_gpu": 10005.74, "total_tokens": 906315214 }, { "epoch": 0.5738309577394348, "grad_norm": 0.8985815048217773, "learning_rate": 2e-05, "loss": 0.6476, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9179, "tokens_per_second_per_gpu": 10447.52, "total_tokens": 906414676 }, { "epoch": 0.5738934733683421, "grad_norm": 0.8866357803344727, "learning_rate": 2e-05, "loss": 0.6395, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9180, "tokens_per_second_per_gpu": 10201.5, "total_tokens": 906512329 }, { "epoch": 0.5739559889972493, "grad_norm": 0.8539925813674927, "learning_rate": 2e-05, "loss": 0.5837, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9181, "tokens_per_second_per_gpu": 10447.81, "total_tokens": 906609445 }, { "epoch": 0.5740185046261566, "grad_norm": 0.8804162740707397, "learning_rate": 2e-05, "loss": 0.6111, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9182, "tokens_per_second_per_gpu": 10502.86, "total_tokens": 906709634 }, { "epoch": 0.5740810202550638, "grad_norm": 0.8847696185112, "learning_rate": 2e-05, "loss": 0.644, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9183, "tokens_per_second_per_gpu": 11044.02, "total_tokens": 906808088 }, { "epoch": 0.574143535883971, "grad_norm": 0.8489985466003418, "learning_rate": 2e-05, "loss": 0.6066, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9184, "tokens_per_second_per_gpu": 10125.52, "total_tokens": 906907920 }, { "epoch": 0.5742060515128782, "grad_norm": 0.9037635326385498, "learning_rate": 2e-05, "loss": 0.6336, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9185, "tokens_per_second_per_gpu": 10414.49, "total_tokens": 907005595 }, { "epoch": 0.5742685671417854, "grad_norm": 0.8855879306793213, "learning_rate": 2e-05, "loss": 0.6559, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9186, "tokens_per_second_per_gpu": 10154.14, "total_tokens": 907103531 }, { "epoch": 0.5743310827706927, "grad_norm": 0.8844151496887207, "learning_rate": 2e-05, "loss": 0.6402, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9187, "tokens_per_second_per_gpu": 10317.5, "total_tokens": 907200913 }, { "epoch": 0.5743935983995999, "grad_norm": 0.9467970132827759, "learning_rate": 2e-05, "loss": 0.6357, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9188, "tokens_per_second_per_gpu": 10227.55, "total_tokens": 907295934 }, { "epoch": 0.5744561140285072, "grad_norm": 0.883126437664032, "learning_rate": 2e-05, "loss": 0.6379, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9189, "tokens_per_second_per_gpu": 10800.84, "total_tokens": 907393447 }, { "epoch": 0.5745186296574144, "grad_norm": 0.8630276322364807, "learning_rate": 2e-05, "loss": 0.6439, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9190, "tokens_per_second_per_gpu": 10484.6, "total_tokens": 907494938 }, { "epoch": 0.5745811452863215, "grad_norm": 0.8773976564407349, "learning_rate": 2e-05, "loss": 0.6378, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9191, "tokens_per_second_per_gpu": 10952.02, "total_tokens": 907596622 }, { "epoch": 0.5746436609152288, "grad_norm": 0.8854815363883972, "learning_rate": 2e-05, "loss": 0.644, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9192, "tokens_per_second_per_gpu": 10657.02, "total_tokens": 907697061 }, { "epoch": 0.574706176544136, "grad_norm": 0.8738884329795837, "learning_rate": 2e-05, "loss": 0.6124, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9193, "tokens_per_second_per_gpu": 10364.23, "total_tokens": 907794764 }, { "epoch": 0.5747686921730433, "grad_norm": 0.9033172726631165, "learning_rate": 2e-05, "loss": 0.612, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9194, "tokens_per_second_per_gpu": 9769.58, "total_tokens": 907888872 }, { "epoch": 0.5748312078019505, "grad_norm": 0.8881169557571411, "learning_rate": 2e-05, "loss": 0.6843, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9195, "tokens_per_second_per_gpu": 11010.23, "total_tokens": 907991436 }, { "epoch": 0.5748937234308578, "grad_norm": 0.8891538977622986, "learning_rate": 2e-05, "loss": 0.6608, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9196, "tokens_per_second_per_gpu": 10689.49, "total_tokens": 908089703 }, { "epoch": 0.5749562390597649, "grad_norm": 0.8955762982368469, "learning_rate": 2e-05, "loss": 0.6043, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9197, "tokens_per_second_per_gpu": 9955.84, "total_tokens": 908183024 }, { "epoch": 0.5750187546886721, "grad_norm": 0.8931479454040527, "learning_rate": 2e-05, "loss": 0.6157, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9198, "tokens_per_second_per_gpu": 10520.25, "total_tokens": 908281827 }, { "epoch": 0.5750812703175794, "grad_norm": 0.844706118106842, "learning_rate": 2e-05, "loss": 0.6405, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9199, "tokens_per_second_per_gpu": 10943.54, "total_tokens": 908385372 }, { "epoch": 0.5751437859464866, "grad_norm": 0.9189333915710449, "learning_rate": 2e-05, "loss": 0.6656, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9200, "tokens_per_second_per_gpu": 10596.87, "total_tokens": 908486250 }, { "epoch": 0.5752063015753939, "grad_norm": 0.8711753487586975, "learning_rate": 2e-05, "loss": 0.6175, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9201, "tokens_per_second_per_gpu": 10213.18, "total_tokens": 908583668 }, { "epoch": 0.5752688172043011, "grad_norm": 0.9062653183937073, "learning_rate": 2e-05, "loss": 0.6025, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9202, "tokens_per_second_per_gpu": 10424.46, "total_tokens": 908682840 }, { "epoch": 0.5753313328332083, "grad_norm": 0.8692642450332642, "learning_rate": 2e-05, "loss": 0.6186, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9203, "tokens_per_second_per_gpu": 10994.71, "total_tokens": 908780715 }, { "epoch": 0.5753938484621155, "grad_norm": 0.8992838263511658, "learning_rate": 2e-05, "loss": 0.6969, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9204, "tokens_per_second_per_gpu": 10567.28, "total_tokens": 908878758 }, { "epoch": 0.5754563640910227, "grad_norm": 0.8708325028419495, "learning_rate": 2e-05, "loss": 0.6094, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9205, "tokens_per_second_per_gpu": 10518.11, "total_tokens": 908978801 }, { "epoch": 0.57551887971993, "grad_norm": 0.8875840306282043, "learning_rate": 2e-05, "loss": 0.6026, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9206, "tokens_per_second_per_gpu": 9504.72, "total_tokens": 909072287 }, { "epoch": 0.5755813953488372, "grad_norm": 0.9004718661308289, "learning_rate": 2e-05, "loss": 0.6342, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9207, "tokens_per_second_per_gpu": 10986.49, "total_tokens": 909171952 }, { "epoch": 0.5756439109777445, "grad_norm": 0.8698639273643494, "learning_rate": 2e-05, "loss": 0.6663, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9208, "tokens_per_second_per_gpu": 10900.94, "total_tokens": 909273928 }, { "epoch": 0.5757064266066516, "grad_norm": 0.9330391883850098, "learning_rate": 2e-05, "loss": 0.6138, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9209, "tokens_per_second_per_gpu": 11511.97, "total_tokens": 909375953 }, { "epoch": 0.5757689422355589, "grad_norm": 0.8894813656806946, "learning_rate": 2e-05, "loss": 0.6319, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9210, "tokens_per_second_per_gpu": 9882.05, "total_tokens": 909472465 }, { "epoch": 0.5758314578644661, "grad_norm": 0.9164736270904541, "learning_rate": 2e-05, "loss": 0.6636, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9211, "tokens_per_second_per_gpu": 9642.0, "total_tokens": 909570858 }, { "epoch": 0.5758939734933733, "grad_norm": 0.8952750563621521, "learning_rate": 2e-05, "loss": 0.6432, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9212, "tokens_per_second_per_gpu": 10256.61, "total_tokens": 909666668 }, { "epoch": 0.5759564891222806, "grad_norm": 0.9026777744293213, "learning_rate": 2e-05, "loss": 0.6545, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9213, "tokens_per_second_per_gpu": 10669.09, "total_tokens": 909763979 }, { "epoch": 0.5760190047511878, "grad_norm": 0.8840540051460266, "learning_rate": 2e-05, "loss": 0.6741, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9214, "tokens_per_second_per_gpu": 9998.3, "total_tokens": 909861340 }, { "epoch": 0.576081520380095, "grad_norm": 0.9100291132926941, "learning_rate": 2e-05, "loss": 0.6696, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9215, "tokens_per_second_per_gpu": 10481.64, "total_tokens": 909960121 }, { "epoch": 0.5761440360090022, "grad_norm": 0.9102585315704346, "learning_rate": 2e-05, "loss": 0.6542, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9216, "tokens_per_second_per_gpu": 10226.76, "total_tokens": 910055075 }, { "epoch": 0.5762065516379095, "grad_norm": 0.8816357851028442, "learning_rate": 2e-05, "loss": 0.6266, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9217, "tokens_per_second_per_gpu": 10482.62, "total_tokens": 910151999 }, { "epoch": 0.5762690672668167, "grad_norm": 0.9262398481369019, "learning_rate": 2e-05, "loss": 0.6196, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9218, "tokens_per_second_per_gpu": 10519.67, "total_tokens": 910244573 }, { "epoch": 0.576331582895724, "grad_norm": 0.8624870181083679, "learning_rate": 2e-05, "loss": 0.6385, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9219, "tokens_per_second_per_gpu": 10473.44, "total_tokens": 910343474 }, { "epoch": 0.5763940985246312, "grad_norm": 0.8710129261016846, "learning_rate": 2e-05, "loss": 0.6383, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9220, "tokens_per_second_per_gpu": 11014.68, "total_tokens": 910442959 }, { "epoch": 0.5764566141535384, "grad_norm": 0.9524421691894531, "learning_rate": 2e-05, "loss": 0.6343, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9221, "tokens_per_second_per_gpu": 9913.14, "total_tokens": 910536392 }, { "epoch": 0.5765191297824456, "grad_norm": 0.8847980499267578, "learning_rate": 2e-05, "loss": 0.6104, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9222, "tokens_per_second_per_gpu": 10673.36, "total_tokens": 910630990 }, { "epoch": 0.5765816454113528, "grad_norm": 0.8962589502334595, "learning_rate": 2e-05, "loss": 0.6203, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9223, "tokens_per_second_per_gpu": 10864.74, "total_tokens": 910727569 }, { "epoch": 0.5766441610402601, "grad_norm": 0.8685939311981201, "learning_rate": 2e-05, "loss": 0.6301, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9224, "tokens_per_second_per_gpu": 10216.68, "total_tokens": 910825063 }, { "epoch": 0.5767066766691673, "grad_norm": 0.9426119923591614, "learning_rate": 2e-05, "loss": 0.6349, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9225, "tokens_per_second_per_gpu": 10950.68, "total_tokens": 910922786 }, { "epoch": 0.5767691922980746, "grad_norm": 0.9350255727767944, "learning_rate": 2e-05, "loss": 0.6795, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9226, "tokens_per_second_per_gpu": 10032.59, "total_tokens": 911018231 }, { "epoch": 0.5768317079269818, "grad_norm": 0.8980101346969604, "learning_rate": 2e-05, "loss": 0.6744, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9227, "tokens_per_second_per_gpu": 9989.41, "total_tokens": 911117795 }, { "epoch": 0.5768942235558889, "grad_norm": 0.8743669986724854, "learning_rate": 2e-05, "loss": 0.6433, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9228, "tokens_per_second_per_gpu": 10881.11, "total_tokens": 911216298 }, { "epoch": 0.5769567391847962, "grad_norm": 0.8738398551940918, "learning_rate": 2e-05, "loss": 0.6458, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9229, "tokens_per_second_per_gpu": 11213.67, "total_tokens": 911319917 }, { "epoch": 0.5770192548137034, "grad_norm": 0.8712477684020996, "learning_rate": 2e-05, "loss": 0.5887, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9230, "tokens_per_second_per_gpu": 10379.14, "total_tokens": 911413644 }, { "epoch": 0.5770817704426107, "grad_norm": 0.8670535087585449, "learning_rate": 2e-05, "loss": 0.6486, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9231, "tokens_per_second_per_gpu": 10772.95, "total_tokens": 911515456 }, { "epoch": 0.5771442860715179, "grad_norm": 0.9043835401535034, "learning_rate": 2e-05, "loss": 0.6733, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9232, "tokens_per_second_per_gpu": 10612.02, "total_tokens": 911615709 }, { "epoch": 0.5772068017004252, "grad_norm": 0.9013758301734924, "learning_rate": 2e-05, "loss": 0.6706, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9233, "tokens_per_second_per_gpu": 9809.64, "total_tokens": 911714033 }, { "epoch": 0.5772693173293323, "grad_norm": 0.8774292469024658, "learning_rate": 2e-05, "loss": 0.6247, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9234, "tokens_per_second_per_gpu": 10480.5, "total_tokens": 911812293 }, { "epoch": 0.5773318329582395, "grad_norm": 0.8870030045509338, "learning_rate": 2e-05, "loss": 0.6619, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9235, "tokens_per_second_per_gpu": 11171.05, "total_tokens": 911911443 }, { "epoch": 0.5773943485871468, "grad_norm": 0.9134788513183594, "learning_rate": 2e-05, "loss": 0.6534, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9236, "tokens_per_second_per_gpu": 10509.8, "total_tokens": 912008913 }, { "epoch": 0.577456864216054, "grad_norm": 0.9329454898834229, "learning_rate": 2e-05, "loss": 0.6528, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9237, "tokens_per_second_per_gpu": 9899.77, "total_tokens": 912103395 }, { "epoch": 0.5775193798449613, "grad_norm": 0.9158583283424377, "learning_rate": 2e-05, "loss": 0.6032, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9238, "tokens_per_second_per_gpu": 10468.84, "total_tokens": 912199064 }, { "epoch": 0.5775818954738685, "grad_norm": 0.8876002430915833, "learning_rate": 2e-05, "loss": 0.6266, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9239, "tokens_per_second_per_gpu": 10979.87, "total_tokens": 912296273 }, { "epoch": 0.5776444111027756, "grad_norm": 0.9315711855888367, "learning_rate": 2e-05, "loss": 0.6557, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9240, "tokens_per_second_per_gpu": 10745.39, "total_tokens": 912395026 }, { "epoch": 0.5777069267316829, "grad_norm": 0.8912149667739868, "learning_rate": 2e-05, "loss": 0.64, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9241, "tokens_per_second_per_gpu": 11155.06, "total_tokens": 912494914 }, { "epoch": 0.5777694423605901, "grad_norm": 0.9087510704994202, "learning_rate": 2e-05, "loss": 0.6133, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9242, "tokens_per_second_per_gpu": 9901.59, "total_tokens": 912592877 }, { "epoch": 0.5778319579894974, "grad_norm": 0.8839424848556519, "learning_rate": 2e-05, "loss": 0.6302, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9243, "tokens_per_second_per_gpu": 10824.78, "total_tokens": 912691122 }, { "epoch": 0.5778944736184046, "grad_norm": 0.9080405235290527, "learning_rate": 2e-05, "loss": 0.665, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9244, "tokens_per_second_per_gpu": 10073.06, "total_tokens": 912787353 }, { "epoch": 0.5779569892473119, "grad_norm": 0.9025023579597473, "learning_rate": 2e-05, "loss": 0.6601, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9245, "tokens_per_second_per_gpu": 9778.33, "total_tokens": 912881811 }, { "epoch": 0.578019504876219, "grad_norm": 0.9478859901428223, "learning_rate": 2e-05, "loss": 0.6328, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9246, "tokens_per_second_per_gpu": 9709.74, "total_tokens": 912971623 }, { "epoch": 0.5780820205051262, "grad_norm": 0.8445065021514893, "learning_rate": 2e-05, "loss": 0.6177, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9247, "tokens_per_second_per_gpu": 10752.33, "total_tokens": 913071887 }, { "epoch": 0.5781445361340335, "grad_norm": 0.8763079643249512, "learning_rate": 2e-05, "loss": 0.6622, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9248, "tokens_per_second_per_gpu": 10538.12, "total_tokens": 913170418 }, { "epoch": 0.5782070517629407, "grad_norm": 0.879914402961731, "learning_rate": 2e-05, "loss": 0.6123, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9249, "tokens_per_second_per_gpu": 9959.91, "total_tokens": 913263800 }, { "epoch": 0.578269567391848, "grad_norm": 0.8993577361106873, "learning_rate": 2e-05, "loss": 0.6484, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9250, "tokens_per_second_per_gpu": 10506.19, "total_tokens": 913362355 }, { "epoch": 0.5783320830207552, "grad_norm": 0.9014998078346252, "learning_rate": 2e-05, "loss": 0.6361, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9251, "tokens_per_second_per_gpu": 10542.68, "total_tokens": 913462355 }, { "epoch": 0.5783945986496624, "grad_norm": 0.9422488212585449, "learning_rate": 2e-05, "loss": 0.667, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9252, "tokens_per_second_per_gpu": 9858.03, "total_tokens": 913556145 }, { "epoch": 0.5784571142785696, "grad_norm": 0.9162529110908508, "learning_rate": 2e-05, "loss": 0.6132, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9253, "tokens_per_second_per_gpu": 9985.52, "total_tokens": 913647656 }, { "epoch": 0.5785196299074769, "grad_norm": 0.9283340573310852, "learning_rate": 2e-05, "loss": 0.6508, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9254, "tokens_per_second_per_gpu": 9681.61, "total_tokens": 913742532 }, { "epoch": 0.5785821455363841, "grad_norm": 0.9035683870315552, "learning_rate": 2e-05, "loss": 0.6382, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9255, "tokens_per_second_per_gpu": 10685.7, "total_tokens": 913839231 }, { "epoch": 0.5786446611652913, "grad_norm": 0.90379798412323, "learning_rate": 2e-05, "loss": 0.6211, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9256, "tokens_per_second_per_gpu": 10039.91, "total_tokens": 913936585 }, { "epoch": 0.5787071767941986, "grad_norm": 0.8923540115356445, "learning_rate": 2e-05, "loss": 0.6386, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9257, "tokens_per_second_per_gpu": 10490.7, "total_tokens": 914032830 }, { "epoch": 0.5787696924231058, "grad_norm": 0.9227358102798462, "learning_rate": 2e-05, "loss": 0.6498, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9258, "tokens_per_second_per_gpu": 10074.91, "total_tokens": 914128867 }, { "epoch": 0.578832208052013, "grad_norm": 0.8985109925270081, "learning_rate": 2e-05, "loss": 0.6365, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9259, "tokens_per_second_per_gpu": 10221.0, "total_tokens": 914228957 }, { "epoch": 0.5788947236809202, "grad_norm": 0.8717222213745117, "learning_rate": 2e-05, "loss": 0.655, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9260, "tokens_per_second_per_gpu": 11034.58, "total_tokens": 914329443 }, { "epoch": 0.5789572393098275, "grad_norm": 0.8844960331916809, "learning_rate": 2e-05, "loss": 0.6142, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9261, "tokens_per_second_per_gpu": 9851.59, "total_tokens": 914421751 }, { "epoch": 0.5790197549387347, "grad_norm": 0.9219273924827576, "learning_rate": 2e-05, "loss": 0.6358, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9262, "tokens_per_second_per_gpu": 10200.25, "total_tokens": 914520197 }, { "epoch": 0.5790822705676419, "grad_norm": 0.8746908903121948, "learning_rate": 2e-05, "loss": 0.613, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9263, "tokens_per_second_per_gpu": 10630.49, "total_tokens": 914614509 }, { "epoch": 0.5791447861965492, "grad_norm": 0.8761359453201294, "learning_rate": 2e-05, "loss": 0.7077, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9264, "tokens_per_second_per_gpu": 11371.09, "total_tokens": 914718034 }, { "epoch": 0.5792073018254563, "grad_norm": 0.9364857077598572, "learning_rate": 2e-05, "loss": 0.6678, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9265, "tokens_per_second_per_gpu": 10550.37, "total_tokens": 914819717 }, { "epoch": 0.5792698174543636, "grad_norm": 0.9013404846191406, "learning_rate": 2e-05, "loss": 0.6201, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9266, "tokens_per_second_per_gpu": 10981.89, "total_tokens": 914918019 }, { "epoch": 0.5793323330832708, "grad_norm": 0.8996241688728333, "learning_rate": 2e-05, "loss": 0.6567, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9267, "tokens_per_second_per_gpu": 10987.68, "total_tokens": 915017297 }, { "epoch": 0.5793948487121781, "grad_norm": 0.9216161966323853, "learning_rate": 2e-05, "loss": 0.646, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9268, "tokens_per_second_per_gpu": 9927.63, "total_tokens": 915113828 }, { "epoch": 0.5794573643410853, "grad_norm": 0.922762930393219, "learning_rate": 2e-05, "loss": 0.6554, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9269, "tokens_per_second_per_gpu": 9521.88, "total_tokens": 915207629 }, { "epoch": 0.5795198799699925, "grad_norm": 0.8779377937316895, "learning_rate": 2e-05, "loss": 0.6181, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9270, "tokens_per_second_per_gpu": 10236.32, "total_tokens": 915305687 }, { "epoch": 0.5795823955988997, "grad_norm": 0.9153302907943726, "learning_rate": 2e-05, "loss": 0.6584, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9271, "tokens_per_second_per_gpu": 10693.34, "total_tokens": 915402761 }, { "epoch": 0.5796449112278069, "grad_norm": 0.8691859841346741, "learning_rate": 2e-05, "loss": 0.6536, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9272, "tokens_per_second_per_gpu": 16557.28, "total_tokens": 915500609 }, { "epoch": 0.5797074268567142, "grad_norm": 0.8984445333480835, "learning_rate": 2e-05, "loss": 0.5972, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9273, "tokens_per_second_per_gpu": 10590.69, "total_tokens": 915596403 }, { "epoch": 0.5797699424856214, "grad_norm": 0.8935994505882263, "learning_rate": 2e-05, "loss": 0.6487, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9274, "tokens_per_second_per_gpu": 9529.64, "total_tokens": 915689067 }, { "epoch": 0.5798324581145287, "grad_norm": 0.8908703327178955, "learning_rate": 2e-05, "loss": 0.6532, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9275, "tokens_per_second_per_gpu": 10961.64, "total_tokens": 915789503 }, { "epoch": 0.5798949737434359, "grad_norm": 0.8591753244400024, "learning_rate": 2e-05, "loss": 0.6663, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9276, "tokens_per_second_per_gpu": 11547.03, "total_tokens": 915895394 }, { "epoch": 0.579957489372343, "grad_norm": 0.9563401937484741, "learning_rate": 2e-05, "loss": 0.6554, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9277, "tokens_per_second_per_gpu": 10497.29, "total_tokens": 915990440 }, { "epoch": 0.5800200050012503, "grad_norm": 0.8878481984138489, "learning_rate": 2e-05, "loss": 0.671, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9278, "tokens_per_second_per_gpu": 11177.06, "total_tokens": 916091321 }, { "epoch": 0.5800825206301575, "grad_norm": 0.9066374897956848, "learning_rate": 2e-05, "loss": 0.6238, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9279, "tokens_per_second_per_gpu": 10087.92, "total_tokens": 916189055 }, { "epoch": 0.5801450362590648, "grad_norm": 0.8786858320236206, "learning_rate": 2e-05, "loss": 0.6513, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9280, "tokens_per_second_per_gpu": 10150.64, "total_tokens": 916288000 }, { "epoch": 0.580207551887972, "grad_norm": 0.8636163473129272, "learning_rate": 2e-05, "loss": 0.632, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9281, "tokens_per_second_per_gpu": 10434.45, "total_tokens": 916385355 }, { "epoch": 0.5802700675168793, "grad_norm": 0.881727397441864, "learning_rate": 2e-05, "loss": 0.6341, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9282, "tokens_per_second_per_gpu": 10030.03, "total_tokens": 916482167 }, { "epoch": 0.5803325831457864, "grad_norm": 0.9131853580474854, "learning_rate": 2e-05, "loss": 0.6456, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9283, "tokens_per_second_per_gpu": 10367.43, "total_tokens": 916580363 }, { "epoch": 0.5803950987746936, "grad_norm": 0.9180923104286194, "learning_rate": 2e-05, "loss": 0.6365, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9284, "tokens_per_second_per_gpu": 10183.22, "total_tokens": 916672922 }, { "epoch": 0.5804576144036009, "grad_norm": 0.8795541524887085, "learning_rate": 2e-05, "loss": 0.6499, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9285, "tokens_per_second_per_gpu": 10793.65, "total_tokens": 916772460 }, { "epoch": 0.5805201300325081, "grad_norm": 0.8954836130142212, "learning_rate": 2e-05, "loss": 0.6525, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9286, "tokens_per_second_per_gpu": 10451.19, "total_tokens": 916869225 }, { "epoch": 0.5805826456614154, "grad_norm": 0.9149262309074402, "learning_rate": 2e-05, "loss": 0.6429, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9287, "tokens_per_second_per_gpu": 10239.32, "total_tokens": 916965429 }, { "epoch": 0.5806451612903226, "grad_norm": 0.9011726379394531, "learning_rate": 2e-05, "loss": 0.6, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9288, "tokens_per_second_per_gpu": 10197.84, "total_tokens": 917059664 }, { "epoch": 0.5807076769192298, "grad_norm": 0.9265313744544983, "learning_rate": 2e-05, "loss": 0.6065, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9289, "tokens_per_second_per_gpu": 9621.25, "total_tokens": 917152832 }, { "epoch": 0.580770192548137, "grad_norm": 0.8709397315979004, "learning_rate": 2e-05, "loss": 0.6228, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9290, "tokens_per_second_per_gpu": 10343.7, "total_tokens": 917251384 }, { "epoch": 0.5808327081770442, "grad_norm": 0.902446985244751, "learning_rate": 2e-05, "loss": 0.6591, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9291, "tokens_per_second_per_gpu": 9963.83, "total_tokens": 917346676 }, { "epoch": 0.5808952238059515, "grad_norm": 0.8689212799072266, "learning_rate": 2e-05, "loss": 0.6261, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9292, "tokens_per_second_per_gpu": 10785.4, "total_tokens": 917443347 }, { "epoch": 0.5809577394348587, "grad_norm": 0.8597619533538818, "learning_rate": 2e-05, "loss": 0.6218, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9293, "tokens_per_second_per_gpu": 10687.72, "total_tokens": 917543651 }, { "epoch": 0.581020255063766, "grad_norm": 0.8363856077194214, "learning_rate": 2e-05, "loss": 0.591, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9294, "tokens_per_second_per_gpu": 10855.8, "total_tokens": 917641206 }, { "epoch": 0.5810827706926732, "grad_norm": 0.8955033421516418, "learning_rate": 2e-05, "loss": 0.6162, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9295, "tokens_per_second_per_gpu": 10004.3, "total_tokens": 917737705 }, { "epoch": 0.5811452863215804, "grad_norm": 0.8899802565574646, "learning_rate": 2e-05, "loss": 0.6403, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9296, "tokens_per_second_per_gpu": 10610.73, "total_tokens": 917839713 }, { "epoch": 0.5812078019504876, "grad_norm": 0.9252933859825134, "learning_rate": 2e-05, "loss": 0.6433, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9297, "tokens_per_second_per_gpu": 9571.26, "total_tokens": 917934305 }, { "epoch": 0.5812703175793948, "grad_norm": 0.9420686960220337, "learning_rate": 2e-05, "loss": 0.6433, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9298, "tokens_per_second_per_gpu": 9982.85, "total_tokens": 918029653 }, { "epoch": 0.5813328332083021, "grad_norm": 0.9008073210716248, "learning_rate": 2e-05, "loss": 0.6195, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9299, "tokens_per_second_per_gpu": 10559.58, "total_tokens": 918125491 }, { "epoch": 0.5813953488372093, "grad_norm": 0.9076144099235535, "learning_rate": 2e-05, "loss": 0.5985, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9300, "tokens_per_second_per_gpu": 10300.34, "total_tokens": 918222461 }, { "epoch": 0.5814578644661166, "grad_norm": 0.8787897825241089, "learning_rate": 2e-05, "loss": 0.6464, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9301, "tokens_per_second_per_gpu": 10647.65, "total_tokens": 918323688 }, { "epoch": 0.5815203800950237, "grad_norm": 0.8955520391464233, "learning_rate": 2e-05, "loss": 0.6359, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9302, "tokens_per_second_per_gpu": 10835.65, "total_tokens": 918421762 }, { "epoch": 0.581582895723931, "grad_norm": 0.9420290589332581, "learning_rate": 2e-05, "loss": 0.6796, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9303, "tokens_per_second_per_gpu": 10218.6, "total_tokens": 918517585 }, { "epoch": 0.5816454113528382, "grad_norm": 0.9304585456848145, "learning_rate": 2e-05, "loss": 0.7, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9304, "tokens_per_second_per_gpu": 10979.35, "total_tokens": 918614952 }, { "epoch": 0.5817079269817454, "grad_norm": 0.9232817888259888, "learning_rate": 2e-05, "loss": 0.6874, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9305, "tokens_per_second_per_gpu": 10318.42, "total_tokens": 918711068 }, { "epoch": 0.5817704426106527, "grad_norm": 0.8848899006843567, "learning_rate": 2e-05, "loss": 0.649, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9306, "tokens_per_second_per_gpu": 10340.38, "total_tokens": 918810055 }, { "epoch": 0.5818329582395599, "grad_norm": 0.9160789847373962, "learning_rate": 2e-05, "loss": 0.6323, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9307, "tokens_per_second_per_gpu": 10241.4, "total_tokens": 918910025 }, { "epoch": 0.5818954738684671, "grad_norm": 0.8568123579025269, "learning_rate": 2e-05, "loss": 0.6228, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9308, "tokens_per_second_per_gpu": 10537.8, "total_tokens": 919005354 }, { "epoch": 0.5819579894973743, "grad_norm": 0.8772767782211304, "learning_rate": 2e-05, "loss": 0.6523, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9309, "tokens_per_second_per_gpu": 10622.67, "total_tokens": 919103145 }, { "epoch": 0.5820205051262816, "grad_norm": 0.9034042358398438, "learning_rate": 2e-05, "loss": 0.6273, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9310, "tokens_per_second_per_gpu": 10850.59, "total_tokens": 919199211 }, { "epoch": 0.5820830207551888, "grad_norm": 0.9740894436836243, "learning_rate": 2e-05, "loss": 0.6716, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9311, "tokens_per_second_per_gpu": 10199.9, "total_tokens": 919296222 }, { "epoch": 0.582145536384096, "grad_norm": 0.9236181378364563, "learning_rate": 2e-05, "loss": 0.6509, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9312, "tokens_per_second_per_gpu": 10042.47, "total_tokens": 919395702 }, { "epoch": 0.5822080520130033, "grad_norm": 1.0088516473770142, "learning_rate": 2e-05, "loss": 0.6565, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9313, "tokens_per_second_per_gpu": 11069.52, "total_tokens": 919494997 }, { "epoch": 0.5822705676419104, "grad_norm": 0.895606517791748, "learning_rate": 2e-05, "loss": 0.6505, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9314, "tokens_per_second_per_gpu": 10159.61, "total_tokens": 919592201 }, { "epoch": 0.5823330832708177, "grad_norm": 0.8527589440345764, "learning_rate": 2e-05, "loss": 0.5944, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9315, "tokens_per_second_per_gpu": 10205.54, "total_tokens": 919688594 }, { "epoch": 0.5823955988997249, "grad_norm": 0.8635093569755554, "learning_rate": 2e-05, "loss": 0.6557, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9316, "tokens_per_second_per_gpu": 10642.88, "total_tokens": 919789226 }, { "epoch": 0.5824581145286322, "grad_norm": 0.8902029991149902, "learning_rate": 2e-05, "loss": 0.6892, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9317, "tokens_per_second_per_gpu": 10702.46, "total_tokens": 919890921 }, { "epoch": 0.5825206301575394, "grad_norm": 0.9016848206520081, "learning_rate": 2e-05, "loss": 0.6892, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9318, "tokens_per_second_per_gpu": 10857.26, "total_tokens": 919991910 }, { "epoch": 0.5825831457864467, "grad_norm": 0.877612292766571, "learning_rate": 2e-05, "loss": 0.6142, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9319, "tokens_per_second_per_gpu": 10006.48, "total_tokens": 920090255 }, { "epoch": 0.5826456614153538, "grad_norm": 0.9105482697486877, "learning_rate": 2e-05, "loss": 0.6112, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9320, "tokens_per_second_per_gpu": 10532.8, "total_tokens": 920190492 }, { "epoch": 0.582708177044261, "grad_norm": 0.8987957835197449, "learning_rate": 2e-05, "loss": 0.6362, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9321, "tokens_per_second_per_gpu": 10433.49, "total_tokens": 920287773 }, { "epoch": 0.5827706926731683, "grad_norm": 0.8747284412384033, "learning_rate": 2e-05, "loss": 0.6887, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9322, "tokens_per_second_per_gpu": 11507.18, "total_tokens": 920388842 }, { "epoch": 0.5828332083020755, "grad_norm": 0.9056312441825867, "learning_rate": 2e-05, "loss": 0.592, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9323, "tokens_per_second_per_gpu": 10294.82, "total_tokens": 920481905 }, { "epoch": 0.5828957239309828, "grad_norm": 0.9107871055603027, "learning_rate": 2e-05, "loss": 0.6497, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9324, "tokens_per_second_per_gpu": 10656.48, "total_tokens": 920583544 }, { "epoch": 0.58295823955989, "grad_norm": 0.8981768488883972, "learning_rate": 2e-05, "loss": 0.6525, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9325, "tokens_per_second_per_gpu": 10536.38, "total_tokens": 920680260 }, { "epoch": 0.5830207551887971, "grad_norm": 0.8967844843864441, "learning_rate": 2e-05, "loss": 0.606, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9326, "tokens_per_second_per_gpu": 10532.71, "total_tokens": 920777676 }, { "epoch": 0.5830832708177044, "grad_norm": 0.9039160013198853, "learning_rate": 2e-05, "loss": 0.6424, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9327, "tokens_per_second_per_gpu": 10824.1, "total_tokens": 920876718 }, { "epoch": 0.5831457864466116, "grad_norm": 0.89002925157547, "learning_rate": 2e-05, "loss": 0.6711, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9328, "tokens_per_second_per_gpu": 10801.94, "total_tokens": 920976973 }, { "epoch": 0.5832083020755189, "grad_norm": 0.8883742094039917, "learning_rate": 2e-05, "loss": 0.6416, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9329, "tokens_per_second_per_gpu": 10890.42, "total_tokens": 921076492 }, { "epoch": 0.5832708177044261, "grad_norm": 0.8782421946525574, "learning_rate": 2e-05, "loss": 0.6207, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9330, "tokens_per_second_per_gpu": 10695.76, "total_tokens": 921173118 }, { "epoch": 0.5833333333333334, "grad_norm": 0.8878767490386963, "learning_rate": 2e-05, "loss": 0.6304, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9331, "tokens_per_second_per_gpu": 10112.99, "total_tokens": 921270908 }, { "epoch": 0.5833958489622406, "grad_norm": 0.8550350069999695, "learning_rate": 2e-05, "loss": 0.6633, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9332, "tokens_per_second_per_gpu": 11425.89, "total_tokens": 921373581 }, { "epoch": 0.5834583645911477, "grad_norm": 0.8654718995094299, "learning_rate": 2e-05, "loss": 0.5688, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9333, "tokens_per_second_per_gpu": 10052.69, "total_tokens": 921467063 }, { "epoch": 0.583520880220055, "grad_norm": 0.8857782483100891, "learning_rate": 2e-05, "loss": 0.6127, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9334, "tokens_per_second_per_gpu": 10607.67, "total_tokens": 921563797 }, { "epoch": 0.5835833958489622, "grad_norm": 0.8935872316360474, "learning_rate": 2e-05, "loss": 0.665, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9335, "tokens_per_second_per_gpu": 11542.04, "total_tokens": 921667960 }, { "epoch": 0.5836459114778695, "grad_norm": 0.8765181303024292, "learning_rate": 2e-05, "loss": 0.6159, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9336, "tokens_per_second_per_gpu": 10097.56, "total_tokens": 921766756 }, { "epoch": 0.5837084271067767, "grad_norm": 0.9780294299125671, "learning_rate": 2e-05, "loss": 0.6124, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9337, "tokens_per_second_per_gpu": 9391.12, "total_tokens": 921856720 }, { "epoch": 0.583770942735684, "grad_norm": 0.9504391551017761, "learning_rate": 2e-05, "loss": 0.5979, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9338, "tokens_per_second_per_gpu": 9227.32, "total_tokens": 921946427 }, { "epoch": 0.5838334583645911, "grad_norm": 0.9187176823616028, "learning_rate": 2e-05, "loss": 0.6671, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9339, "tokens_per_second_per_gpu": 10347.18, "total_tokens": 922040846 }, { "epoch": 0.5838959739934984, "grad_norm": 0.9322976469993591, "learning_rate": 2e-05, "loss": 0.6317, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9340, "tokens_per_second_per_gpu": 9665.9, "total_tokens": 922134254 }, { "epoch": 0.5839584896224056, "grad_norm": 0.9510199427604675, "learning_rate": 2e-05, "loss": 0.6563, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9341, "tokens_per_second_per_gpu": 9903.82, "total_tokens": 922229845 }, { "epoch": 0.5840210052513128, "grad_norm": 0.8862713575363159, "learning_rate": 2e-05, "loss": 0.6148, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9342, "tokens_per_second_per_gpu": 10292.65, "total_tokens": 922326995 }, { "epoch": 0.5840835208802201, "grad_norm": 0.9743895530700684, "learning_rate": 2e-05, "loss": 0.6331, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9343, "tokens_per_second_per_gpu": 10425.41, "total_tokens": 922423644 }, { "epoch": 0.5841460365091273, "grad_norm": 0.9266406297683716, "learning_rate": 2e-05, "loss": 0.6674, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9344, "tokens_per_second_per_gpu": 10534.31, "total_tokens": 922517792 }, { "epoch": 0.5842085521380345, "grad_norm": 0.8699267506599426, "learning_rate": 2e-05, "loss": 0.6081, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9345, "tokens_per_second_per_gpu": 9947.35, "total_tokens": 922613808 }, { "epoch": 0.5842710677669417, "grad_norm": 0.9337587356567383, "learning_rate": 2e-05, "loss": 0.6427, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9346, "tokens_per_second_per_gpu": 10478.6, "total_tokens": 922706863 }, { "epoch": 0.584333583395849, "grad_norm": 0.9223719239234924, "learning_rate": 2e-05, "loss": 0.6064, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9347, "tokens_per_second_per_gpu": 10234.97, "total_tokens": 922800991 }, { "epoch": 0.5843960990247562, "grad_norm": 0.8672981262207031, "learning_rate": 2e-05, "loss": 0.6583, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9348, "tokens_per_second_per_gpu": 10197.82, "total_tokens": 922900794 }, { "epoch": 0.5844586146536634, "grad_norm": 0.9058066010475159, "learning_rate": 2e-05, "loss": 0.6688, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9349, "tokens_per_second_per_gpu": 10678.2, "total_tokens": 922999443 }, { "epoch": 0.5845211302825707, "grad_norm": 0.8631483912467957, "learning_rate": 2e-05, "loss": 0.6288, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9350, "tokens_per_second_per_gpu": 10497.33, "total_tokens": 923100013 }, { "epoch": 0.5845836459114778, "grad_norm": 0.8602228760719299, "learning_rate": 2e-05, "loss": 0.6165, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9351, "tokens_per_second_per_gpu": 10438.18, "total_tokens": 923197133 }, { "epoch": 0.5846461615403851, "grad_norm": 0.89875727891922, "learning_rate": 2e-05, "loss": 0.6595, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9352, "tokens_per_second_per_gpu": 11099.76, "total_tokens": 923296874 }, { "epoch": 0.5847086771692923, "grad_norm": 0.8791815638542175, "learning_rate": 2e-05, "loss": 0.6393, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9353, "tokens_per_second_per_gpu": 10878.33, "total_tokens": 923397851 }, { "epoch": 0.5847711927981996, "grad_norm": 0.9016631245613098, "learning_rate": 2e-05, "loss": 0.5947, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9354, "tokens_per_second_per_gpu": 10121.68, "total_tokens": 923493275 }, { "epoch": 0.5848337084271068, "grad_norm": 0.9090139865875244, "learning_rate": 2e-05, "loss": 0.6415, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9355, "tokens_per_second_per_gpu": 10968.17, "total_tokens": 923595984 }, { "epoch": 0.584896224056014, "grad_norm": 0.8885989189147949, "learning_rate": 2e-05, "loss": 0.6546, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9356, "tokens_per_second_per_gpu": 10978.95, "total_tokens": 923694186 }, { "epoch": 0.5849587396849212, "grad_norm": 0.9060112833976746, "learning_rate": 2e-05, "loss": 0.6635, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9357, "tokens_per_second_per_gpu": 10119.33, "total_tokens": 923792998 }, { "epoch": 0.5850212553138284, "grad_norm": 0.9212630391120911, "learning_rate": 2e-05, "loss": 0.6916, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9358, "tokens_per_second_per_gpu": 10688.12, "total_tokens": 923893623 }, { "epoch": 0.5850837709427357, "grad_norm": 0.9126282930374146, "learning_rate": 2e-05, "loss": 0.6466, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9359, "tokens_per_second_per_gpu": 10946.05, "total_tokens": 923991665 }, { "epoch": 0.5851462865716429, "grad_norm": 0.9222660064697266, "learning_rate": 2e-05, "loss": 0.6597, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9360, "tokens_per_second_per_gpu": 11063.38, "total_tokens": 924094986 }, { "epoch": 0.5852088022005502, "grad_norm": 0.8983381986618042, "learning_rate": 2e-05, "loss": 0.6427, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9361, "tokens_per_second_per_gpu": 10679.53, "total_tokens": 924195681 }, { "epoch": 0.5852713178294574, "grad_norm": 0.8910735249519348, "learning_rate": 2e-05, "loss": 0.5942, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9362, "tokens_per_second_per_gpu": 10553.09, "total_tokens": 924290630 }, { "epoch": 0.5853338334583645, "grad_norm": 0.9016518592834473, "learning_rate": 2e-05, "loss": 0.6436, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9363, "tokens_per_second_per_gpu": 10282.25, "total_tokens": 924389172 }, { "epoch": 0.5853963490872718, "grad_norm": 0.8859212398529053, "learning_rate": 2e-05, "loss": 0.6188, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9364, "tokens_per_second_per_gpu": 9835.68, "total_tokens": 924483594 }, { "epoch": 0.585458864716179, "grad_norm": 0.8549920916557312, "learning_rate": 2e-05, "loss": 0.6065, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9365, "tokens_per_second_per_gpu": 10634.56, "total_tokens": 924583462 }, { "epoch": 0.5855213803450863, "grad_norm": 0.8730600476264954, "learning_rate": 2e-05, "loss": 0.6161, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9366, "tokens_per_second_per_gpu": 10018.84, "total_tokens": 924678599 }, { "epoch": 0.5855838959739935, "grad_norm": 0.8551077842712402, "learning_rate": 2e-05, "loss": 0.5961, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9367, "tokens_per_second_per_gpu": 10269.72, "total_tokens": 924776474 }, { "epoch": 0.5856464116029008, "grad_norm": 0.8423683047294617, "learning_rate": 2e-05, "loss": 0.658, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9368, "tokens_per_second_per_gpu": 11414.14, "total_tokens": 924880318 }, { "epoch": 0.5857089272318079, "grad_norm": 0.8635967373847961, "learning_rate": 2e-05, "loss": 0.6538, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9369, "tokens_per_second_per_gpu": 11181.09, "total_tokens": 924981478 }, { "epoch": 0.5857714428607151, "grad_norm": 0.9233448505401611, "learning_rate": 2e-05, "loss": 0.6542, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9370, "tokens_per_second_per_gpu": 11291.25, "total_tokens": 925082643 }, { "epoch": 0.5858339584896224, "grad_norm": 0.8956215977668762, "learning_rate": 2e-05, "loss": 0.674, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9371, "tokens_per_second_per_gpu": 10441.91, "total_tokens": 925183537 }, { "epoch": 0.5858964741185296, "grad_norm": 0.9041163325309753, "learning_rate": 2e-05, "loss": 0.6287, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9372, "tokens_per_second_per_gpu": 10294.54, "total_tokens": 925280610 }, { "epoch": 0.5859589897474369, "grad_norm": 0.8745344877243042, "learning_rate": 2e-05, "loss": 0.6443, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9373, "tokens_per_second_per_gpu": 10883.39, "total_tokens": 925383155 }, { "epoch": 0.5860215053763441, "grad_norm": 0.8586344122886658, "learning_rate": 2e-05, "loss": 0.6078, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9374, "tokens_per_second_per_gpu": 10617.43, "total_tokens": 925477442 }, { "epoch": 0.5860840210052514, "grad_norm": 0.8767625689506531, "learning_rate": 2e-05, "loss": 0.6386, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9375, "tokens_per_second_per_gpu": 11312.76, "total_tokens": 925575415 }, { "epoch": 0.5861465366341585, "grad_norm": 0.9038676619529724, "learning_rate": 2e-05, "loss": 0.6151, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9376, "tokens_per_second_per_gpu": 10082.65, "total_tokens": 925673567 }, { "epoch": 0.5862090522630657, "grad_norm": 0.8996225595474243, "learning_rate": 2e-05, "loss": 0.6381, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9377, "tokens_per_second_per_gpu": 10354.0, "total_tokens": 925772647 }, { "epoch": 0.586271567891973, "grad_norm": 0.9075721502304077, "learning_rate": 2e-05, "loss": 0.6189, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9378, "tokens_per_second_per_gpu": 11017.4, "total_tokens": 925866252 }, { "epoch": 0.5863340835208802, "grad_norm": 0.9324693083763123, "learning_rate": 2e-05, "loss": 0.6426, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9379, "tokens_per_second_per_gpu": 9861.82, "total_tokens": 925963422 }, { "epoch": 0.5863965991497875, "grad_norm": 0.8619415163993835, "learning_rate": 2e-05, "loss": 0.6231, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9380, "tokens_per_second_per_gpu": 10548.05, "total_tokens": 926064856 }, { "epoch": 0.5864591147786947, "grad_norm": 0.9098606109619141, "learning_rate": 2e-05, "loss": 0.6186, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9381, "tokens_per_second_per_gpu": 10346.97, "total_tokens": 926162668 }, { "epoch": 0.5865216304076019, "grad_norm": 0.9061547517776489, "learning_rate": 2e-05, "loss": 0.6145, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9382, "tokens_per_second_per_gpu": 10808.84, "total_tokens": 926259049 }, { "epoch": 0.5865841460365091, "grad_norm": 0.8759748339653015, "learning_rate": 2e-05, "loss": 0.6107, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9383, "tokens_per_second_per_gpu": 10866.16, "total_tokens": 926359325 }, { "epoch": 0.5866466616654163, "grad_norm": 0.8745761513710022, "learning_rate": 2e-05, "loss": 0.6153, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9384, "tokens_per_second_per_gpu": 11038.74, "total_tokens": 926461050 }, { "epoch": 0.5867091772943236, "grad_norm": 0.8893464803695679, "learning_rate": 2e-05, "loss": 0.6172, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9385, "tokens_per_second_per_gpu": 10164.54, "total_tokens": 926557379 }, { "epoch": 0.5867716929232308, "grad_norm": 0.8719837069511414, "learning_rate": 2e-05, "loss": 0.6224, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9386, "tokens_per_second_per_gpu": 10849.87, "total_tokens": 926657513 }, { "epoch": 0.5868342085521381, "grad_norm": 0.8907215595245361, "learning_rate": 2e-05, "loss": 0.6152, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9387, "tokens_per_second_per_gpu": 10395.01, "total_tokens": 926752982 }, { "epoch": 0.5868967241810452, "grad_norm": 0.8817213773727417, "learning_rate": 2e-05, "loss": 0.6281, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9388, "tokens_per_second_per_gpu": 11209.2, "total_tokens": 926855781 }, { "epoch": 0.5869592398099525, "grad_norm": 0.8865199685096741, "learning_rate": 2e-05, "loss": 0.6036, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9389, "tokens_per_second_per_gpu": 10450.83, "total_tokens": 926957194 }, { "epoch": 0.5870217554388597, "grad_norm": 0.9048195481300354, "learning_rate": 2e-05, "loss": 0.6421, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9390, "tokens_per_second_per_gpu": 11016.65, "total_tokens": 927059042 }, { "epoch": 0.587084271067767, "grad_norm": 0.8467347621917725, "learning_rate": 2e-05, "loss": 0.5885, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9391, "tokens_per_second_per_gpu": 10554.22, "total_tokens": 927159529 }, { "epoch": 0.5871467866966742, "grad_norm": 0.9325504899024963, "learning_rate": 2e-05, "loss": 0.6049, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9392, "tokens_per_second_per_gpu": 9257.87, "total_tokens": 927251862 }, { "epoch": 0.5872093023255814, "grad_norm": 0.9119670987129211, "learning_rate": 2e-05, "loss": 0.668, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9393, "tokens_per_second_per_gpu": 10342.75, "total_tokens": 927350374 }, { "epoch": 0.5872718179544886, "grad_norm": 0.9001041650772095, "learning_rate": 2e-05, "loss": 0.6169, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9394, "tokens_per_second_per_gpu": 10642.56, "total_tokens": 927447128 }, { "epoch": 0.5873343335833958, "grad_norm": 0.9176625609397888, "learning_rate": 2e-05, "loss": 0.6583, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9395, "tokens_per_second_per_gpu": 11069.18, "total_tokens": 927547230 }, { "epoch": 0.5873968492123031, "grad_norm": 0.900506317615509, "learning_rate": 2e-05, "loss": 0.6112, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9396, "tokens_per_second_per_gpu": 11347.97, "total_tokens": 927647431 }, { "epoch": 0.5874593648412103, "grad_norm": 0.9091394543647766, "learning_rate": 2e-05, "loss": 0.6261, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9397, "tokens_per_second_per_gpu": 10068.68, "total_tokens": 927743030 }, { "epoch": 0.5875218804701176, "grad_norm": 0.8974677324295044, "learning_rate": 2e-05, "loss": 0.6472, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9398, "tokens_per_second_per_gpu": 11235.34, "total_tokens": 927845594 }, { "epoch": 0.5875843960990248, "grad_norm": 0.8800586462020874, "learning_rate": 2e-05, "loss": 0.6025, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9399, "tokens_per_second_per_gpu": 9957.6, "total_tokens": 927941401 }, { "epoch": 0.5876469117279319, "grad_norm": 0.8850120902061462, "learning_rate": 2e-05, "loss": 0.6725, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9400, "tokens_per_second_per_gpu": 10737.52, "total_tokens": 928043551 }, { "epoch": 0.5877094273568392, "grad_norm": 0.9102883338928223, "learning_rate": 2e-05, "loss": 0.6675, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9401, "tokens_per_second_per_gpu": 10748.44, "total_tokens": 928143775 }, { "epoch": 0.5877719429857464, "grad_norm": 0.9356865286827087, "learning_rate": 2e-05, "loss": 0.6642, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9402, "tokens_per_second_per_gpu": 9772.85, "total_tokens": 928240653 }, { "epoch": 0.5878344586146537, "grad_norm": 0.8668432235717773, "learning_rate": 2e-05, "loss": 0.593, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9403, "tokens_per_second_per_gpu": 10153.14, "total_tokens": 928337208 }, { "epoch": 0.5878969742435609, "grad_norm": 0.9074442386627197, "learning_rate": 2e-05, "loss": 0.636, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9404, "tokens_per_second_per_gpu": 10590.79, "total_tokens": 928436111 }, { "epoch": 0.5879594898724682, "grad_norm": 0.8896439671516418, "learning_rate": 2e-05, "loss": 0.664, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9405, "tokens_per_second_per_gpu": 11017.08, "total_tokens": 928538583 }, { "epoch": 0.5880220055013753, "grad_norm": 0.9050403833389282, "learning_rate": 2e-05, "loss": 0.601, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9406, "tokens_per_second_per_gpu": 10482.29, "total_tokens": 928637758 }, { "epoch": 0.5880845211302825, "grad_norm": 1.709337830543518, "learning_rate": 2e-05, "loss": 0.6378, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9407, "tokens_per_second_per_gpu": 10622.25, "total_tokens": 928735059 }, { "epoch": 0.5881470367591898, "grad_norm": 0.8789997696876526, "learning_rate": 2e-05, "loss": 0.6224, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9408, "tokens_per_second_per_gpu": 10529.43, "total_tokens": 928836212 }, { "epoch": 0.588209552388097, "grad_norm": 0.9896772503852844, "learning_rate": 2e-05, "loss": 0.648, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9409, "tokens_per_second_per_gpu": 10625.62, "total_tokens": 928935954 }, { "epoch": 0.5882720680170043, "grad_norm": 0.9365217685699463, "learning_rate": 2e-05, "loss": 0.6133, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9410, "tokens_per_second_per_gpu": 10479.79, "total_tokens": 929036378 }, { "epoch": 0.5883345836459115, "grad_norm": 0.9232798218727112, "learning_rate": 2e-05, "loss": 0.6467, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9411, "tokens_per_second_per_gpu": 10581.6, "total_tokens": 929137464 }, { "epoch": 0.5883970992748188, "grad_norm": 0.9456436634063721, "learning_rate": 2e-05, "loss": 0.6755, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9412, "tokens_per_second_per_gpu": 11174.55, "total_tokens": 929238886 }, { "epoch": 0.5884596149037259, "grad_norm": 0.916772186756134, "learning_rate": 2e-05, "loss": 0.6125, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9413, "tokens_per_second_per_gpu": 10005.15, "total_tokens": 929335149 }, { "epoch": 0.5885221305326331, "grad_norm": 0.8716717958450317, "learning_rate": 2e-05, "loss": 0.6309, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9414, "tokens_per_second_per_gpu": 11151.9, "total_tokens": 929435316 }, { "epoch": 0.5885846461615404, "grad_norm": 0.9158414006233215, "learning_rate": 2e-05, "loss": 0.6357, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9415, "tokens_per_second_per_gpu": 9882.42, "total_tokens": 929530622 }, { "epoch": 0.5886471617904476, "grad_norm": 0.9346785545349121, "learning_rate": 2e-05, "loss": 0.6306, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9416, "tokens_per_second_per_gpu": 10968.66, "total_tokens": 929631148 }, { "epoch": 0.5887096774193549, "grad_norm": 0.8761019706726074, "learning_rate": 2e-05, "loss": 0.6142, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9417, "tokens_per_second_per_gpu": 10074.48, "total_tokens": 929728725 }, { "epoch": 0.5887721930482621, "grad_norm": 0.9014268517494202, "learning_rate": 2e-05, "loss": 0.6924, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9418, "tokens_per_second_per_gpu": 11180.14, "total_tokens": 929833023 }, { "epoch": 0.5888347086771692, "grad_norm": 0.8635684251785278, "learning_rate": 2e-05, "loss": 0.6387, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9419, "tokens_per_second_per_gpu": 10760.52, "total_tokens": 929933868 }, { "epoch": 0.5888972243060765, "grad_norm": 0.8976725935935974, "learning_rate": 2e-05, "loss": 0.6383, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9420, "tokens_per_second_per_gpu": 10249.43, "total_tokens": 930033073 }, { "epoch": 0.5889597399349837, "grad_norm": 0.8803507089614868, "learning_rate": 2e-05, "loss": 0.621, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9421, "tokens_per_second_per_gpu": 10388.12, "total_tokens": 930129327 }, { "epoch": 0.589022255563891, "grad_norm": 0.8890717625617981, "learning_rate": 2e-05, "loss": 0.5791, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9422, "tokens_per_second_per_gpu": 10303.94, "total_tokens": 930221531 }, { "epoch": 0.5890847711927982, "grad_norm": 0.9047715663909912, "learning_rate": 2e-05, "loss": 0.6247, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9423, "tokens_per_second_per_gpu": 10832.96, "total_tokens": 930318930 }, { "epoch": 0.5891472868217055, "grad_norm": 0.9074051380157471, "learning_rate": 2e-05, "loss": 0.6148, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9424, "tokens_per_second_per_gpu": 10538.54, "total_tokens": 930413247 }, { "epoch": 0.5892098024506126, "grad_norm": 0.8892760872840881, "learning_rate": 2e-05, "loss": 0.6304, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9425, "tokens_per_second_per_gpu": 10904.98, "total_tokens": 930513534 }, { "epoch": 0.5892723180795199, "grad_norm": 0.9039164185523987, "learning_rate": 2e-05, "loss": 0.6438, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9426, "tokens_per_second_per_gpu": 10686.87, "total_tokens": 930610114 }, { "epoch": 0.5893348337084271, "grad_norm": 0.8779803514480591, "learning_rate": 2e-05, "loss": 0.6299, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9427, "tokens_per_second_per_gpu": 10659.36, "total_tokens": 930709274 }, { "epoch": 0.5893973493373343, "grad_norm": 0.8925896286964417, "learning_rate": 2e-05, "loss": 0.6399, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9428, "tokens_per_second_per_gpu": 10392.1, "total_tokens": 930807236 }, { "epoch": 0.5894598649662416, "grad_norm": 0.8532010316848755, "learning_rate": 2e-05, "loss": 0.6266, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9429, "tokens_per_second_per_gpu": 10726.96, "total_tokens": 930909274 }, { "epoch": 0.5895223805951488, "grad_norm": 0.9238168597221375, "learning_rate": 2e-05, "loss": 0.6505, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9430, "tokens_per_second_per_gpu": 10269.28, "total_tokens": 931008411 }, { "epoch": 0.589584896224056, "grad_norm": 0.9324676990509033, "learning_rate": 2e-05, "loss": 0.6096, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9431, "tokens_per_second_per_gpu": 9472.46, "total_tokens": 931103122 }, { "epoch": 0.5896474118529632, "grad_norm": 0.8634533882141113, "learning_rate": 2e-05, "loss": 0.6106, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9432, "tokens_per_second_per_gpu": 11081.38, "total_tokens": 931202736 }, { "epoch": 0.5897099274818705, "grad_norm": 0.88011234998703, "learning_rate": 2e-05, "loss": 0.605, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9433, "tokens_per_second_per_gpu": 10388.94, "total_tokens": 931299368 }, { "epoch": 0.5897724431107777, "grad_norm": 0.9055966138839722, "learning_rate": 2e-05, "loss": 0.6282, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9434, "tokens_per_second_per_gpu": 10989.73, "total_tokens": 931396682 }, { "epoch": 0.589834958739685, "grad_norm": 0.8829355239868164, "learning_rate": 2e-05, "loss": 0.635, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9435, "tokens_per_second_per_gpu": 9698.85, "total_tokens": 931491300 }, { "epoch": 0.5898974743685922, "grad_norm": 0.8704380393028259, "learning_rate": 2e-05, "loss": 0.6124, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9436, "tokens_per_second_per_gpu": 10316.45, "total_tokens": 931589005 }, { "epoch": 0.5899599899974993, "grad_norm": 0.8830422759056091, "learning_rate": 2e-05, "loss": 0.6559, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9437, "tokens_per_second_per_gpu": 10586.73, "total_tokens": 931690100 }, { "epoch": 0.5900225056264066, "grad_norm": 0.8944454193115234, "learning_rate": 2e-05, "loss": 0.6441, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9438, "tokens_per_second_per_gpu": 11301.65, "total_tokens": 931791736 }, { "epoch": 0.5900850212553138, "grad_norm": 0.8984922766685486, "learning_rate": 2e-05, "loss": 0.6542, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9439, "tokens_per_second_per_gpu": 11270.7, "total_tokens": 931893475 }, { "epoch": 0.5901475368842211, "grad_norm": 0.8693749308586121, "learning_rate": 2e-05, "loss": 0.634, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9440, "tokens_per_second_per_gpu": 11197.6, "total_tokens": 931994112 }, { "epoch": 0.5902100525131283, "grad_norm": 0.8633973598480225, "learning_rate": 2e-05, "loss": 0.6057, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9441, "tokens_per_second_per_gpu": 10927.14, "total_tokens": 932093358 }, { "epoch": 0.5902725681420355, "grad_norm": 0.9127577543258667, "learning_rate": 2e-05, "loss": 0.6484, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9442, "tokens_per_second_per_gpu": 11216.13, "total_tokens": 932194628 }, { "epoch": 0.5903350837709427, "grad_norm": 0.8946340084075928, "learning_rate": 2e-05, "loss": 0.6106, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9443, "tokens_per_second_per_gpu": 10104.62, "total_tokens": 932290125 }, { "epoch": 0.5903975993998499, "grad_norm": 0.8860042095184326, "learning_rate": 2e-05, "loss": 0.6221, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9444, "tokens_per_second_per_gpu": 10788.41, "total_tokens": 932389500 }, { "epoch": 0.5904601150287572, "grad_norm": 0.8818404674530029, "learning_rate": 2e-05, "loss": 0.6543, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9445, "tokens_per_second_per_gpu": 10892.02, "total_tokens": 932487724 }, { "epoch": 0.5905226306576644, "grad_norm": 0.8946150541305542, "learning_rate": 2e-05, "loss": 0.6238, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9446, "tokens_per_second_per_gpu": 11170.35, "total_tokens": 932582818 }, { "epoch": 0.5905851462865717, "grad_norm": 0.9144937992095947, "learning_rate": 2e-05, "loss": 0.6927, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9447, "tokens_per_second_per_gpu": 10916.94, "total_tokens": 932684014 }, { "epoch": 0.5906476619154789, "grad_norm": 0.8823127150535583, "learning_rate": 2e-05, "loss": 0.6254, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9448, "tokens_per_second_per_gpu": 10614.8, "total_tokens": 932785693 }, { "epoch": 0.5907101775443862, "grad_norm": 0.8416752815246582, "learning_rate": 2e-05, "loss": 0.607, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9449, "tokens_per_second_per_gpu": 10680.86, "total_tokens": 932887390 }, { "epoch": 0.5907726931732933, "grad_norm": 0.8658838272094727, "learning_rate": 2e-05, "loss": 0.6222, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9450, "tokens_per_second_per_gpu": 10871.73, "total_tokens": 932985678 }, { "epoch": 0.5908352088022005, "grad_norm": 0.8923285007476807, "learning_rate": 2e-05, "loss": 0.645, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9451, "tokens_per_second_per_gpu": 10787.52, "total_tokens": 933085787 }, { "epoch": 0.5908977244311078, "grad_norm": 0.8593712449073792, "learning_rate": 2e-05, "loss": 0.5982, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9452, "tokens_per_second_per_gpu": 10920.01, "total_tokens": 933181353 }, { "epoch": 0.590960240060015, "grad_norm": 0.9078493714332581, "learning_rate": 2e-05, "loss": 0.6582, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9453, "tokens_per_second_per_gpu": 10824.84, "total_tokens": 933280495 }, { "epoch": 0.5910227556889223, "grad_norm": 0.8960784077644348, "learning_rate": 2e-05, "loss": 0.6138, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9454, "tokens_per_second_per_gpu": 10166.06, "total_tokens": 933381526 }, { "epoch": 0.5910852713178295, "grad_norm": 0.8765454888343811, "learning_rate": 2e-05, "loss": 0.6589, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9455, "tokens_per_second_per_gpu": 11693.87, "total_tokens": 933485913 }, { "epoch": 0.5911477869467366, "grad_norm": 0.8722847700119019, "learning_rate": 2e-05, "loss": 0.625, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9456, "tokens_per_second_per_gpu": 10703.81, "total_tokens": 933584634 }, { "epoch": 0.5912103025756439, "grad_norm": 0.8385326862335205, "learning_rate": 2e-05, "loss": 0.6311, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9457, "tokens_per_second_per_gpu": 10952.83, "total_tokens": 933686479 }, { "epoch": 0.5912728182045511, "grad_norm": 0.914273738861084, "learning_rate": 2e-05, "loss": 0.6657, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9458, "tokens_per_second_per_gpu": 10454.11, "total_tokens": 933784563 }, { "epoch": 0.5913353338334584, "grad_norm": 0.8744220733642578, "learning_rate": 2e-05, "loss": 0.6065, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9459, "tokens_per_second_per_gpu": 10583.47, "total_tokens": 933881775 }, { "epoch": 0.5913978494623656, "grad_norm": 0.9262036085128784, "learning_rate": 2e-05, "loss": 0.6546, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9460, "tokens_per_second_per_gpu": 9802.94, "total_tokens": 933977993 }, { "epoch": 0.5914603650912729, "grad_norm": 0.8819963932037354, "learning_rate": 2e-05, "loss": 0.6041, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9461, "tokens_per_second_per_gpu": 9595.76, "total_tokens": 934073828 }, { "epoch": 0.59152288072018, "grad_norm": 0.9029101133346558, "learning_rate": 2e-05, "loss": 0.6354, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9462, "tokens_per_second_per_gpu": 10363.82, "total_tokens": 934175221 }, { "epoch": 0.5915853963490872, "grad_norm": 0.8684815168380737, "learning_rate": 2e-05, "loss": 0.6809, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9463, "tokens_per_second_per_gpu": 11380.6, "total_tokens": 934278140 }, { "epoch": 0.5916479119779945, "grad_norm": 0.9344123005867004, "learning_rate": 2e-05, "loss": 0.6421, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9464, "tokens_per_second_per_gpu": 11223.58, "total_tokens": 934378685 }, { "epoch": 0.5917104276069017, "grad_norm": 0.8678119778633118, "learning_rate": 2e-05, "loss": 0.6447, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9465, "tokens_per_second_per_gpu": 10446.15, "total_tokens": 934479789 }, { "epoch": 0.591772943235809, "grad_norm": 0.9179660677909851, "learning_rate": 2e-05, "loss": 0.6401, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9466, "tokens_per_second_per_gpu": 9365.56, "total_tokens": 934572488 }, { "epoch": 0.5918354588647162, "grad_norm": 0.8642946481704712, "learning_rate": 2e-05, "loss": 0.6157, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9467, "tokens_per_second_per_gpu": 10725.54, "total_tokens": 934671026 }, { "epoch": 0.5918979744936234, "grad_norm": 0.8862820267677307, "learning_rate": 2e-05, "loss": 0.6655, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9468, "tokens_per_second_per_gpu": 10801.76, "total_tokens": 934775057 }, { "epoch": 0.5919604901225306, "grad_norm": 0.9162336587905884, "learning_rate": 2e-05, "loss": 0.6743, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9469, "tokens_per_second_per_gpu": 10560.44, "total_tokens": 934876143 }, { "epoch": 0.5920230057514378, "grad_norm": 0.87872713804245, "learning_rate": 2e-05, "loss": 0.662, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9470, "tokens_per_second_per_gpu": 10983.66, "total_tokens": 934979396 }, { "epoch": 0.5920855213803451, "grad_norm": 0.9341262578964233, "learning_rate": 2e-05, "loss": 0.6481, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9471, "tokens_per_second_per_gpu": 11274.98, "total_tokens": 935080282 }, { "epoch": 0.5921480370092523, "grad_norm": 0.9327883720397949, "learning_rate": 2e-05, "loss": 0.6557, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9472, "tokens_per_second_per_gpu": 10374.52, "total_tokens": 935178103 }, { "epoch": 0.5922105526381596, "grad_norm": 0.8844712972640991, "learning_rate": 2e-05, "loss": 0.5803, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9473, "tokens_per_second_per_gpu": 10269.2, "total_tokens": 935272605 }, { "epoch": 0.5922730682670667, "grad_norm": 0.927306056022644, "learning_rate": 2e-05, "loss": 0.6199, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9474, "tokens_per_second_per_gpu": 10285.61, "total_tokens": 935369835 }, { "epoch": 0.592335583895974, "grad_norm": 0.8910142779350281, "learning_rate": 2e-05, "loss": 0.6332, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9475, "tokens_per_second_per_gpu": 10303.43, "total_tokens": 935467606 }, { "epoch": 0.5923980995248812, "grad_norm": 0.8519225716590881, "learning_rate": 2e-05, "loss": 0.6328, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9476, "tokens_per_second_per_gpu": 11056.02, "total_tokens": 935570210 }, { "epoch": 0.5924606151537885, "grad_norm": 0.8884062767028809, "learning_rate": 2e-05, "loss": 0.6431, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9477, "tokens_per_second_per_gpu": 10468.6, "total_tokens": 935672128 }, { "epoch": 0.5925231307826957, "grad_norm": 0.8643782734870911, "learning_rate": 2e-05, "loss": 0.5915, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9478, "tokens_per_second_per_gpu": 10877.35, "total_tokens": 935769392 }, { "epoch": 0.5925856464116029, "grad_norm": 0.8638476133346558, "learning_rate": 2e-05, "loss": 0.5938, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9479, "tokens_per_second_per_gpu": 10225.33, "total_tokens": 935866627 }, { "epoch": 0.5926481620405101, "grad_norm": 0.850273847579956, "learning_rate": 2e-05, "loss": 0.6104, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9480, "tokens_per_second_per_gpu": 11091.23, "total_tokens": 935966972 }, { "epoch": 0.5927106776694173, "grad_norm": 0.9060727953910828, "learning_rate": 2e-05, "loss": 0.6067, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9481, "tokens_per_second_per_gpu": 9767.18, "total_tokens": 936061000 }, { "epoch": 0.5927731932983246, "grad_norm": 0.8894248604774475, "learning_rate": 2e-05, "loss": 0.6605, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9482, "tokens_per_second_per_gpu": 11073.38, "total_tokens": 936162167 }, { "epoch": 0.5928357089272318, "grad_norm": 0.8960851430892944, "learning_rate": 2e-05, "loss": 0.6249, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9483, "tokens_per_second_per_gpu": 10497.98, "total_tokens": 936261204 }, { "epoch": 0.592898224556139, "grad_norm": 0.8826379776000977, "learning_rate": 2e-05, "loss": 0.6368, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9484, "tokens_per_second_per_gpu": 10459.67, "total_tokens": 936361632 }, { "epoch": 0.5929607401850463, "grad_norm": 0.9299185276031494, "learning_rate": 2e-05, "loss": 0.6448, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9485, "tokens_per_second_per_gpu": 10511.04, "total_tokens": 936458598 }, { "epoch": 0.5930232558139535, "grad_norm": 0.9004703760147095, "learning_rate": 2e-05, "loss": 0.6108, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9486, "tokens_per_second_per_gpu": 10446.83, "total_tokens": 936552135 }, { "epoch": 0.5930857714428607, "grad_norm": 0.9370192289352417, "learning_rate": 2e-05, "loss": 0.6579, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9487, "tokens_per_second_per_gpu": 10842.14, "total_tokens": 936652937 }, { "epoch": 0.5931482870717679, "grad_norm": 0.873887300491333, "learning_rate": 2e-05, "loss": 0.616, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9488, "tokens_per_second_per_gpu": 10929.03, "total_tokens": 936750460 }, { "epoch": 0.5932108027006752, "grad_norm": 0.859152615070343, "learning_rate": 2e-05, "loss": 0.5921, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9489, "tokens_per_second_per_gpu": 9964.57, "total_tokens": 936846378 }, { "epoch": 0.5932733183295824, "grad_norm": 0.9069373607635498, "learning_rate": 2e-05, "loss": 0.6179, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9490, "tokens_per_second_per_gpu": 10173.86, "total_tokens": 936943822 }, { "epoch": 0.5933358339584897, "grad_norm": 0.8928219676017761, "learning_rate": 2e-05, "loss": 0.6698, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9491, "tokens_per_second_per_gpu": 10747.03, "total_tokens": 937043727 }, { "epoch": 0.5933983495873969, "grad_norm": 0.9430103302001953, "learning_rate": 2e-05, "loss": 0.6464, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9492, "tokens_per_second_per_gpu": 10288.47, "total_tokens": 937139029 }, { "epoch": 0.593460865216304, "grad_norm": 0.976061999797821, "learning_rate": 2e-05, "loss": 0.6583, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9493, "tokens_per_second_per_gpu": 9528.29, "total_tokens": 937232102 }, { "epoch": 0.5935233808452113, "grad_norm": 0.847590982913971, "learning_rate": 2e-05, "loss": 0.6381, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9494, "tokens_per_second_per_gpu": 10859.72, "total_tokens": 937332086 }, { "epoch": 0.5935858964741185, "grad_norm": 0.8562767505645752, "learning_rate": 2e-05, "loss": 0.612, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9495, "tokens_per_second_per_gpu": 10814.37, "total_tokens": 937432833 }, { "epoch": 0.5936484121030258, "grad_norm": 0.8833588361740112, "learning_rate": 2e-05, "loss": 0.6851, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9496, "tokens_per_second_per_gpu": 11004.32, "total_tokens": 937536607 }, { "epoch": 0.593710927731933, "grad_norm": 0.9521560072898865, "learning_rate": 2e-05, "loss": 0.6431, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9497, "tokens_per_second_per_gpu": 10190.69, "total_tokens": 937637564 }, { "epoch": 0.5937734433608403, "grad_norm": 0.9476543664932251, "learning_rate": 2e-05, "loss": 0.6302, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9498, "tokens_per_second_per_gpu": 10452.07, "total_tokens": 937732403 }, { "epoch": 0.5938359589897474, "grad_norm": 0.92088383436203, "learning_rate": 2e-05, "loss": 0.6153, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9499, "tokens_per_second_per_gpu": 10235.9, "total_tokens": 937829557 }, { "epoch": 0.5938984746186546, "grad_norm": 0.8874523043632507, "learning_rate": 2e-05, "loss": 0.6417, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9500, "tokens_per_second_per_gpu": 11122.02, "total_tokens": 937927639 }, { "epoch": 0.5939609902475619, "grad_norm": 0.9404094219207764, "learning_rate": 2e-05, "loss": 0.6429, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9501, "tokens_per_second_per_gpu": 10517.57, "total_tokens": 938026507 }, { "epoch": 0.5940235058764691, "grad_norm": 1.0255138874053955, "learning_rate": 2e-05, "loss": 0.623, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9502, "tokens_per_second_per_gpu": 9892.29, "total_tokens": 938121833 }, { "epoch": 0.5940860215053764, "grad_norm": 0.8951523303985596, "learning_rate": 2e-05, "loss": 0.5724, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9503, "tokens_per_second_per_gpu": 9928.73, "total_tokens": 938219552 }, { "epoch": 0.5941485371342836, "grad_norm": 0.8946573138237, "learning_rate": 2e-05, "loss": 0.614, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9504, "tokens_per_second_per_gpu": 11068.63, "total_tokens": 938316307 }, { "epoch": 0.5942110527631908, "grad_norm": 0.8820232152938843, "learning_rate": 2e-05, "loss": 0.6187, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9505, "tokens_per_second_per_gpu": 10340.94, "total_tokens": 938412909 }, { "epoch": 0.594273568392098, "grad_norm": 0.8977928757667542, "learning_rate": 2e-05, "loss": 0.5913, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9506, "tokens_per_second_per_gpu": 10155.36, "total_tokens": 938514822 }, { "epoch": 0.5943360840210052, "grad_norm": 0.9245261549949646, "learning_rate": 2e-05, "loss": 0.6451, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9507, "tokens_per_second_per_gpu": 11043.48, "total_tokens": 938617016 }, { "epoch": 0.5943985996499125, "grad_norm": 0.9023211598396301, "learning_rate": 2e-05, "loss": 0.6318, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9508, "tokens_per_second_per_gpu": 9748.97, "total_tokens": 938714889 }, { "epoch": 0.5944611152788197, "grad_norm": 0.8577662706375122, "learning_rate": 2e-05, "loss": 0.6605, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9509, "tokens_per_second_per_gpu": 13423.04, "total_tokens": 938820184 }, { "epoch": 0.594523630907727, "grad_norm": 0.9150590896606445, "learning_rate": 2e-05, "loss": 0.6365, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9510, "tokens_per_second_per_gpu": 12291.51, "total_tokens": 938918322 }, { "epoch": 0.5945861465366341, "grad_norm": 0.9010671973228455, "learning_rate": 2e-05, "loss": 0.6803, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9511, "tokens_per_second_per_gpu": 10632.69, "total_tokens": 939021790 }, { "epoch": 0.5946486621655414, "grad_norm": 0.9137391448020935, "learning_rate": 2e-05, "loss": 0.6673, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9512, "tokens_per_second_per_gpu": 10831.72, "total_tokens": 939123021 }, { "epoch": 0.5947111777944486, "grad_norm": 0.8442551493644714, "learning_rate": 2e-05, "loss": 0.6195, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9513, "tokens_per_second_per_gpu": 10775.47, "total_tokens": 939222512 }, { "epoch": 0.5947736934233558, "grad_norm": 0.8857799172401428, "learning_rate": 2e-05, "loss": 0.6313, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9514, "tokens_per_second_per_gpu": 10959.58, "total_tokens": 939320408 }, { "epoch": 0.5948362090522631, "grad_norm": 0.8811872601509094, "learning_rate": 2e-05, "loss": 0.6846, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9515, "tokens_per_second_per_gpu": 10858.76, "total_tokens": 939423147 }, { "epoch": 0.5948987246811703, "grad_norm": 0.8826920986175537, "learning_rate": 2e-05, "loss": 0.6398, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9516, "tokens_per_second_per_gpu": 11444.04, "total_tokens": 939523006 }, { "epoch": 0.5949612403100775, "grad_norm": 0.896689236164093, "learning_rate": 2e-05, "loss": 0.6668, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9517, "tokens_per_second_per_gpu": 10341.32, "total_tokens": 939618938 }, { "epoch": 0.5950237559389847, "grad_norm": 0.8972726464271545, "learning_rate": 2e-05, "loss": 0.6397, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9518, "tokens_per_second_per_gpu": 10285.19, "total_tokens": 939715657 }, { "epoch": 0.595086271567892, "grad_norm": 0.8790094256401062, "learning_rate": 2e-05, "loss": 0.6197, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9519, "tokens_per_second_per_gpu": 10351.71, "total_tokens": 939811785 }, { "epoch": 0.5951487871967992, "grad_norm": 0.8970609903335571, "learning_rate": 2e-05, "loss": 0.6331, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9520, "tokens_per_second_per_gpu": 10485.06, "total_tokens": 939911903 }, { "epoch": 0.5952113028257064, "grad_norm": 0.8654226064682007, "learning_rate": 2e-05, "loss": 0.6324, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9521, "tokens_per_second_per_gpu": 10644.74, "total_tokens": 940009074 }, { "epoch": 0.5952738184546137, "grad_norm": 0.8753946423530579, "learning_rate": 2e-05, "loss": 0.5921, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9522, "tokens_per_second_per_gpu": 10600.54, "total_tokens": 940110349 }, { "epoch": 0.5953363340835209, "grad_norm": 0.8924089670181274, "learning_rate": 2e-05, "loss": 0.6211, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9523, "tokens_per_second_per_gpu": 10698.41, "total_tokens": 940207956 }, { "epoch": 0.5953988497124281, "grad_norm": 0.9468743801116943, "learning_rate": 2e-05, "loss": 0.6747, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9524, "tokens_per_second_per_gpu": 10506.27, "total_tokens": 940301704 }, { "epoch": 0.5954613653413353, "grad_norm": 0.9272376894950867, "learning_rate": 2e-05, "loss": 0.6367, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9525, "tokens_per_second_per_gpu": 11189.77, "total_tokens": 940400755 }, { "epoch": 0.5955238809702426, "grad_norm": 0.8805156350135803, "learning_rate": 2e-05, "loss": 0.597, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9526, "tokens_per_second_per_gpu": 10704.75, "total_tokens": 940500979 }, { "epoch": 0.5955863965991498, "grad_norm": 0.8872970938682556, "learning_rate": 2e-05, "loss": 0.62, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9527, "tokens_per_second_per_gpu": 10431.79, "total_tokens": 940602829 }, { "epoch": 0.595648912228057, "grad_norm": 0.8589628338813782, "learning_rate": 2e-05, "loss": 0.582, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9528, "tokens_per_second_per_gpu": 10000.08, "total_tokens": 940697265 }, { "epoch": 0.5957114278569643, "grad_norm": 0.8979445695877075, "learning_rate": 2e-05, "loss": 0.6591, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9529, "tokens_per_second_per_gpu": 11008.07, "total_tokens": 940797289 }, { "epoch": 0.5957739434858714, "grad_norm": 0.8795205950737, "learning_rate": 2e-05, "loss": 0.6109, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9530, "tokens_per_second_per_gpu": 10106.56, "total_tokens": 940895723 }, { "epoch": 0.5958364591147787, "grad_norm": 0.8595429062843323, "learning_rate": 2e-05, "loss": 0.6138, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9531, "tokens_per_second_per_gpu": 11030.6, "total_tokens": 940997552 }, { "epoch": 0.5958989747436859, "grad_norm": 0.9465442299842834, "learning_rate": 2e-05, "loss": 0.6568, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9532, "tokens_per_second_per_gpu": 9458.26, "total_tokens": 941092885 }, { "epoch": 0.5959614903725932, "grad_norm": 0.9384521245956421, "learning_rate": 2e-05, "loss": 0.6332, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9533, "tokens_per_second_per_gpu": 9517.77, "total_tokens": 941190857 }, { "epoch": 0.5960240060015004, "grad_norm": 0.8874000906944275, "learning_rate": 2e-05, "loss": 0.6577, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9534, "tokens_per_second_per_gpu": 11414.39, "total_tokens": 941294331 }, { "epoch": 0.5960865216304077, "grad_norm": 0.895200252532959, "learning_rate": 2e-05, "loss": 0.6181, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9535, "tokens_per_second_per_gpu": 10263.24, "total_tokens": 941393804 }, { "epoch": 0.5961490372593148, "grad_norm": 0.9337186217308044, "learning_rate": 2e-05, "loss": 0.6675, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9536, "tokens_per_second_per_gpu": 10807.22, "total_tokens": 941492852 }, { "epoch": 0.596211552888222, "grad_norm": 0.9074666500091553, "learning_rate": 2e-05, "loss": 0.6425, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9537, "tokens_per_second_per_gpu": 10624.82, "total_tokens": 941590748 }, { "epoch": 0.5962740685171293, "grad_norm": 0.8860523700714111, "learning_rate": 2e-05, "loss": 0.6169, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9538, "tokens_per_second_per_gpu": 9842.86, "total_tokens": 941685871 }, { "epoch": 0.5963365841460365, "grad_norm": 0.9318339824676514, "learning_rate": 2e-05, "loss": 0.6277, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9539, "tokens_per_second_per_gpu": 11548.96, "total_tokens": 941787617 }, { "epoch": 0.5963990997749438, "grad_norm": 0.9285314679145813, "learning_rate": 2e-05, "loss": 0.6797, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9540, "tokens_per_second_per_gpu": 10426.55, "total_tokens": 941888516 }, { "epoch": 0.596461615403851, "grad_norm": 0.8759810924530029, "learning_rate": 2e-05, "loss": 0.5939, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9541, "tokens_per_second_per_gpu": 10063.69, "total_tokens": 941986269 }, { "epoch": 0.5965241310327581, "grad_norm": 0.9082079529762268, "learning_rate": 2e-05, "loss": 0.6182, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9542, "tokens_per_second_per_gpu": 10047.05, "total_tokens": 942081367 }, { "epoch": 0.5965866466616654, "grad_norm": 0.9000456929206848, "learning_rate": 2e-05, "loss": 0.6222, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9543, "tokens_per_second_per_gpu": 10452.62, "total_tokens": 942178715 }, { "epoch": 0.5966491622905726, "grad_norm": 0.901559054851532, "learning_rate": 2e-05, "loss": 0.6054, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9544, "tokens_per_second_per_gpu": 10611.08, "total_tokens": 942274130 }, { "epoch": 0.5967116779194799, "grad_norm": 0.9158545136451721, "learning_rate": 2e-05, "loss": 0.6265, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9545, "tokens_per_second_per_gpu": 9867.7, "total_tokens": 942374322 }, { "epoch": 0.5967741935483871, "grad_norm": 0.898984432220459, "learning_rate": 2e-05, "loss": 0.619, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9546, "tokens_per_second_per_gpu": 11159.02, "total_tokens": 942473128 }, { "epoch": 0.5968367091772944, "grad_norm": 0.8879101276397705, "learning_rate": 2e-05, "loss": 0.6377, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9547, "tokens_per_second_per_gpu": 10927.11, "total_tokens": 942570694 }, { "epoch": 0.5968992248062015, "grad_norm": 0.8687682151794434, "learning_rate": 2e-05, "loss": 0.6917, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9548, "tokens_per_second_per_gpu": 10556.53, "total_tokens": 942672317 }, { "epoch": 0.5969617404351087, "grad_norm": 0.8844283819198608, "learning_rate": 2e-05, "loss": 0.6626, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9549, "tokens_per_second_per_gpu": 10145.14, "total_tokens": 942773697 }, { "epoch": 0.597024256064016, "grad_norm": 0.8549471497535706, "learning_rate": 2e-05, "loss": 0.6405, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9550, "tokens_per_second_per_gpu": 11124.57, "total_tokens": 942878559 }, { "epoch": 0.5970867716929232, "grad_norm": 0.8842414617538452, "learning_rate": 2e-05, "loss": 0.64, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9551, "tokens_per_second_per_gpu": 10412.36, "total_tokens": 942975997 }, { "epoch": 0.5971492873218305, "grad_norm": 0.872096061706543, "learning_rate": 2e-05, "loss": 0.6457, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9552, "tokens_per_second_per_gpu": 10297.09, "total_tokens": 943076573 }, { "epoch": 0.5972118029507377, "grad_norm": 0.9022363424301147, "learning_rate": 2e-05, "loss": 0.6459, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9553, "tokens_per_second_per_gpu": 10973.08, "total_tokens": 943175937 }, { "epoch": 0.5972743185796449, "grad_norm": 0.8695029616355896, "learning_rate": 2e-05, "loss": 0.6161, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9554, "tokens_per_second_per_gpu": 11478.41, "total_tokens": 943279824 }, { "epoch": 0.5973368342085521, "grad_norm": 0.8874764442443848, "learning_rate": 2e-05, "loss": 0.6337, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9555, "tokens_per_second_per_gpu": 10217.02, "total_tokens": 943378025 }, { "epoch": 0.5973993498374593, "grad_norm": 0.9001777768135071, "learning_rate": 2e-05, "loss": 0.6344, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9556, "tokens_per_second_per_gpu": 10721.09, "total_tokens": 943478690 }, { "epoch": 0.5974618654663666, "grad_norm": 0.9025313854217529, "learning_rate": 2e-05, "loss": 0.6007, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9557, "tokens_per_second_per_gpu": 10323.69, "total_tokens": 943574307 }, { "epoch": 0.5975243810952738, "grad_norm": 0.9611110091209412, "learning_rate": 2e-05, "loss": 0.6558, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9558, "tokens_per_second_per_gpu": 10433.28, "total_tokens": 943671309 }, { "epoch": 0.5975868967241811, "grad_norm": 0.906832218170166, "learning_rate": 2e-05, "loss": 0.672, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9559, "tokens_per_second_per_gpu": 10479.56, "total_tokens": 943773374 }, { "epoch": 0.5976494123530883, "grad_norm": 0.9018044471740723, "learning_rate": 2e-05, "loss": 0.6657, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9560, "tokens_per_second_per_gpu": 11112.72, "total_tokens": 943876732 }, { "epoch": 0.5977119279819955, "grad_norm": 0.8783947825431824, "learning_rate": 2e-05, "loss": 0.6014, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9561, "tokens_per_second_per_gpu": 10375.16, "total_tokens": 943972286 }, { "epoch": 0.5977744436109027, "grad_norm": 0.9008549451828003, "learning_rate": 2e-05, "loss": 0.6317, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9562, "tokens_per_second_per_gpu": 11097.03, "total_tokens": 944074007 }, { "epoch": 0.59783695923981, "grad_norm": 0.8414729237556458, "learning_rate": 2e-05, "loss": 0.6135, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9563, "tokens_per_second_per_gpu": 11095.41, "total_tokens": 944175216 }, { "epoch": 0.5978994748687172, "grad_norm": 0.8632100224494934, "learning_rate": 2e-05, "loss": 0.6486, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9564, "tokens_per_second_per_gpu": 10677.22, "total_tokens": 944275464 }, { "epoch": 0.5979619904976244, "grad_norm": 0.9335871934890747, "learning_rate": 2e-05, "loss": 0.6796, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9565, "tokens_per_second_per_gpu": 10304.78, "total_tokens": 944370334 }, { "epoch": 0.5980245061265317, "grad_norm": 0.9244226217269897, "learning_rate": 2e-05, "loss": 0.5897, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9566, "tokens_per_second_per_gpu": 9865.15, "total_tokens": 944464953 }, { "epoch": 0.5980870217554388, "grad_norm": 0.8778173327445984, "learning_rate": 2e-05, "loss": 0.6249, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9567, "tokens_per_second_per_gpu": 10913.53, "total_tokens": 944568531 }, { "epoch": 0.5981495373843461, "grad_norm": 0.860824465751648, "learning_rate": 2e-05, "loss": 0.6354, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9568, "tokens_per_second_per_gpu": 10399.65, "total_tokens": 944668841 }, { "epoch": 0.5982120530132533, "grad_norm": 0.896368145942688, "learning_rate": 2e-05, "loss": 0.6483, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9569, "tokens_per_second_per_gpu": 11186.43, "total_tokens": 944770257 }, { "epoch": 0.5982745686421606, "grad_norm": 0.8419223427772522, "learning_rate": 2e-05, "loss": 0.6075, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9570, "tokens_per_second_per_gpu": 10419.25, "total_tokens": 944868043 }, { "epoch": 0.5983370842710678, "grad_norm": 0.888788104057312, "learning_rate": 2e-05, "loss": 0.5941, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9571, "tokens_per_second_per_gpu": 10135.97, "total_tokens": 944960819 }, { "epoch": 0.598399599899975, "grad_norm": 0.8817039132118225, "learning_rate": 2e-05, "loss": 0.6284, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9572, "tokens_per_second_per_gpu": 11224.71, "total_tokens": 945062242 }, { "epoch": 0.5984621155288822, "grad_norm": 0.8950924277305603, "learning_rate": 2e-05, "loss": 0.6137, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9573, "tokens_per_second_per_gpu": 10159.11, "total_tokens": 945156850 }, { "epoch": 0.5985246311577894, "grad_norm": 0.8756985664367676, "learning_rate": 2e-05, "loss": 0.6136, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9574, "tokens_per_second_per_gpu": 10597.27, "total_tokens": 945259948 }, { "epoch": 0.5985871467866967, "grad_norm": 0.8471185564994812, "learning_rate": 2e-05, "loss": 0.6022, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9575, "tokens_per_second_per_gpu": 9577.68, "total_tokens": 945357318 }, { "epoch": 0.5986496624156039, "grad_norm": 0.9075239300727844, "learning_rate": 2e-05, "loss": 0.6371, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9576, "tokens_per_second_per_gpu": 10572.85, "total_tokens": 945458585 }, { "epoch": 0.5987121780445112, "grad_norm": 0.8388801217079163, "learning_rate": 2e-05, "loss": 0.5807, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9577, "tokens_per_second_per_gpu": 10688.93, "total_tokens": 945558569 }, { "epoch": 0.5987746936734184, "grad_norm": 0.9170857667922974, "learning_rate": 2e-05, "loss": 0.6378, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9578, "tokens_per_second_per_gpu": 11012.16, "total_tokens": 945659381 }, { "epoch": 0.5988372093023255, "grad_norm": 0.8921852707862854, "learning_rate": 2e-05, "loss": 0.6091, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9579, "tokens_per_second_per_gpu": 10629.68, "total_tokens": 945754334 }, { "epoch": 0.5988997249312328, "grad_norm": 0.9393802285194397, "learning_rate": 2e-05, "loss": 0.642, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9580, "tokens_per_second_per_gpu": 10890.91, "total_tokens": 945856992 }, { "epoch": 0.59896224056014, "grad_norm": 0.9105345606803894, "learning_rate": 2e-05, "loss": 0.6637, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9581, "tokens_per_second_per_gpu": 10914.61, "total_tokens": 945956807 }, { "epoch": 0.5990247561890473, "grad_norm": 0.9242872595787048, "learning_rate": 2e-05, "loss": 0.6322, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9582, "tokens_per_second_per_gpu": 9771.46, "total_tokens": 946054170 }, { "epoch": 0.5990872718179545, "grad_norm": 0.890975296497345, "learning_rate": 2e-05, "loss": 0.6327, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9583, "tokens_per_second_per_gpu": 11143.92, "total_tokens": 946152791 }, { "epoch": 0.5991497874468618, "grad_norm": 0.9773730635643005, "learning_rate": 2e-05, "loss": 0.6288, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9584, "tokens_per_second_per_gpu": 10226.09, "total_tokens": 946249045 }, { "epoch": 0.5992123030757689, "grad_norm": 0.890886664390564, "learning_rate": 2e-05, "loss": 0.6109, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9585, "tokens_per_second_per_gpu": 10174.82, "total_tokens": 946347416 }, { "epoch": 0.5992748187046761, "grad_norm": 0.8951869606971741, "learning_rate": 2e-05, "loss": 0.6308, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9586, "tokens_per_second_per_gpu": 10693.46, "total_tokens": 946449104 }, { "epoch": 0.5993373343335834, "grad_norm": 0.8659431338310242, "learning_rate": 2e-05, "loss": 0.6438, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9587, "tokens_per_second_per_gpu": 11341.3, "total_tokens": 946553402 }, { "epoch": 0.5993998499624906, "grad_norm": 0.87409907579422, "learning_rate": 2e-05, "loss": 0.6101, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9588, "tokens_per_second_per_gpu": 10735.21, "total_tokens": 946657008 }, { "epoch": 0.5994623655913979, "grad_norm": 0.8834806084632874, "learning_rate": 2e-05, "loss": 0.6098, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9589, "tokens_per_second_per_gpu": 10781.81, "total_tokens": 946758233 }, { "epoch": 0.5995248812203051, "grad_norm": 0.8920218348503113, "learning_rate": 2e-05, "loss": 0.6071, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9590, "tokens_per_second_per_gpu": 9815.45, "total_tokens": 946851961 }, { "epoch": 0.5995873968492123, "grad_norm": 0.9172190427780151, "learning_rate": 2e-05, "loss": 0.6252, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9591, "tokens_per_second_per_gpu": 10082.6, "total_tokens": 946949707 }, { "epoch": 0.5996499124781195, "grad_norm": 0.8960210084915161, "learning_rate": 2e-05, "loss": 0.6238, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9592, "tokens_per_second_per_gpu": 10064.53, "total_tokens": 947046324 }, { "epoch": 0.5997124281070267, "grad_norm": 0.8862524032592773, "learning_rate": 2e-05, "loss": 0.6937, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9593, "tokens_per_second_per_gpu": 11080.61, "total_tokens": 947150338 }, { "epoch": 0.599774943735934, "grad_norm": 0.8804299831390381, "learning_rate": 2e-05, "loss": 0.638, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9594, "tokens_per_second_per_gpu": 10725.34, "total_tokens": 947251103 }, { "epoch": 0.5998374593648412, "grad_norm": 0.9266701340675354, "learning_rate": 2e-05, "loss": 0.6519, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9595, "tokens_per_second_per_gpu": 10911.44, "total_tokens": 947353863 }, { "epoch": 0.5998999749937485, "grad_norm": 0.9184728264808655, "learning_rate": 2e-05, "loss": 0.6166, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9596, "tokens_per_second_per_gpu": 11051.19, "total_tokens": 947454558 }, { "epoch": 0.5999624906226556, "grad_norm": 0.9144945740699768, "learning_rate": 2e-05, "loss": 0.6366, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9597, "tokens_per_second_per_gpu": 10227.67, "total_tokens": 947551423 }, { "epoch": 0.6000250062515629, "grad_norm": 0.8858734965324402, "learning_rate": 2e-05, "loss": 0.6817, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9598, "tokens_per_second_per_gpu": 11549.61, "total_tokens": 947653694 }, { "epoch": 0.6000875218804701, "grad_norm": 0.9310133457183838, "learning_rate": 2e-05, "loss": 0.6305, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9599, "tokens_per_second_per_gpu": 9419.77, "total_tokens": 947746380 }, { "epoch": 0.6001500375093773, "grad_norm": 0.8974087834358215, "learning_rate": 2e-05, "loss": 0.6092, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9600, "tokens_per_second_per_gpu": 10363.37, "total_tokens": 947842486 }, { "epoch": 0.6002125531382846, "grad_norm": 1.0322396755218506, "learning_rate": 2e-05, "loss": 0.6308, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9601, "tokens_per_second_per_gpu": 10516.73, "total_tokens": 947940924 }, { "epoch": 0.6002750687671918, "grad_norm": 0.8839902281761169, "learning_rate": 2e-05, "loss": 0.6144, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9602, "tokens_per_second_per_gpu": 10289.33, "total_tokens": 948041034 }, { "epoch": 0.6003375843960991, "grad_norm": 0.8637564778327942, "learning_rate": 2e-05, "loss": 0.6343, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9603, "tokens_per_second_per_gpu": 11666.06, "total_tokens": 948144539 }, { "epoch": 0.6004001000250062, "grad_norm": 0.9108986258506775, "learning_rate": 2e-05, "loss": 0.6293, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9604, "tokens_per_second_per_gpu": 10662.88, "total_tokens": 948241167 }, { "epoch": 0.6004626156539135, "grad_norm": 0.8708287477493286, "learning_rate": 2e-05, "loss": 0.6154, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9605, "tokens_per_second_per_gpu": 11468.09, "total_tokens": 948341536 }, { "epoch": 0.6005251312828207, "grad_norm": 0.8764418363571167, "learning_rate": 2e-05, "loss": 0.6216, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9606, "tokens_per_second_per_gpu": 10907.17, "total_tokens": 948440216 }, { "epoch": 0.600587646911728, "grad_norm": 0.9103885293006897, "learning_rate": 2e-05, "loss": 0.6334, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9607, "tokens_per_second_per_gpu": 9752.76, "total_tokens": 948532973 }, { "epoch": 0.6006501625406352, "grad_norm": 0.858055591583252, "learning_rate": 2e-05, "loss": 0.6328, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9608, "tokens_per_second_per_gpu": 10833.86, "total_tokens": 948636469 }, { "epoch": 0.6007126781695424, "grad_norm": 0.8994773030281067, "learning_rate": 2e-05, "loss": 0.6572, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9609, "tokens_per_second_per_gpu": 10424.84, "total_tokens": 948733271 }, { "epoch": 0.6007751937984496, "grad_norm": 0.8960517644882202, "learning_rate": 2e-05, "loss": 0.6319, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9610, "tokens_per_second_per_gpu": 10456.95, "total_tokens": 948830794 }, { "epoch": 0.6008377094273568, "grad_norm": 0.9280921816825867, "learning_rate": 2e-05, "loss": 0.6583, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9611, "tokens_per_second_per_gpu": 10506.95, "total_tokens": 948926829 }, { "epoch": 0.6009002250562641, "grad_norm": 0.8762468695640564, "learning_rate": 2e-05, "loss": 0.6171, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9612, "tokens_per_second_per_gpu": 10016.95, "total_tokens": 949024989 }, { "epoch": 0.6009627406851713, "grad_norm": 0.8721242547035217, "learning_rate": 2e-05, "loss": 0.6029, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9613, "tokens_per_second_per_gpu": 10131.32, "total_tokens": 949121060 }, { "epoch": 0.6010252563140785, "grad_norm": 0.8771994709968567, "learning_rate": 2e-05, "loss": 0.5826, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9614, "tokens_per_second_per_gpu": 10547.71, "total_tokens": 949217144 }, { "epoch": 0.6010877719429858, "grad_norm": 0.9221134781837463, "learning_rate": 2e-05, "loss": 0.6099, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9615, "tokens_per_second_per_gpu": 11174.23, "total_tokens": 949321008 }, { "epoch": 0.6011502875718929, "grad_norm": 0.9076821208000183, "learning_rate": 2e-05, "loss": 0.6375, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9616, "tokens_per_second_per_gpu": 10658.72, "total_tokens": 949418555 }, { "epoch": 0.6012128032008002, "grad_norm": 0.882520854473114, "learning_rate": 2e-05, "loss": 0.6134, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9617, "tokens_per_second_per_gpu": 9942.43, "total_tokens": 949513763 }, { "epoch": 0.6012753188297074, "grad_norm": 0.8937317132949829, "learning_rate": 2e-05, "loss": 0.6351, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9618, "tokens_per_second_per_gpu": 10640.61, "total_tokens": 949615480 }, { "epoch": 0.6013378344586147, "grad_norm": 0.8916881084442139, "learning_rate": 2e-05, "loss": 0.6241, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9619, "tokens_per_second_per_gpu": 9968.15, "total_tokens": 949710799 }, { "epoch": 0.6014003500875219, "grad_norm": 0.8554118275642395, "learning_rate": 2e-05, "loss": 0.6281, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9620, "tokens_per_second_per_gpu": 11079.16, "total_tokens": 949813661 }, { "epoch": 0.6014628657164292, "grad_norm": 0.9049755930900574, "learning_rate": 2e-05, "loss": 0.5928, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9621, "tokens_per_second_per_gpu": 10065.4, "total_tokens": 949908919 }, { "epoch": 0.6015253813453363, "grad_norm": 0.9290140867233276, "learning_rate": 2e-05, "loss": 0.6594, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9622, "tokens_per_second_per_gpu": 11090.65, "total_tokens": 950013844 }, { "epoch": 0.6015878969742435, "grad_norm": 0.8596907258033752, "learning_rate": 2e-05, "loss": 0.6578, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9623, "tokens_per_second_per_gpu": 10388.34, "total_tokens": 950114335 }, { "epoch": 0.6016504126031508, "grad_norm": 0.9319398403167725, "learning_rate": 2e-05, "loss": 0.6493, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9624, "tokens_per_second_per_gpu": 11087.47, "total_tokens": 950216680 }, { "epoch": 0.601712928232058, "grad_norm": 0.8780888319015503, "learning_rate": 2e-05, "loss": 0.6465, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9625, "tokens_per_second_per_gpu": 10507.0, "total_tokens": 950314905 }, { "epoch": 0.6017754438609653, "grad_norm": 0.908641517162323, "learning_rate": 2e-05, "loss": 0.6652, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9626, "tokens_per_second_per_gpu": 10976.16, "total_tokens": 950412888 }, { "epoch": 0.6018379594898725, "grad_norm": 0.8732320070266724, "learning_rate": 2e-05, "loss": 0.629, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9627, "tokens_per_second_per_gpu": 10424.92, "total_tokens": 950513487 }, { "epoch": 0.6019004751187796, "grad_norm": 0.8449411392211914, "learning_rate": 2e-05, "loss": 0.6421, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9628, "tokens_per_second_per_gpu": 11469.15, "total_tokens": 950616763 }, { "epoch": 0.6019629907476869, "grad_norm": 0.8669173717498779, "learning_rate": 2e-05, "loss": 0.5912, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9629, "tokens_per_second_per_gpu": 10601.09, "total_tokens": 950715226 }, { "epoch": 0.6020255063765941, "grad_norm": 0.881860077381134, "learning_rate": 2e-05, "loss": 0.6336, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9630, "tokens_per_second_per_gpu": 10625.2, "total_tokens": 950816341 }, { "epoch": 0.6020880220055014, "grad_norm": 0.8614519238471985, "learning_rate": 2e-05, "loss": 0.6397, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9631, "tokens_per_second_per_gpu": 10587.04, "total_tokens": 950918198 }, { "epoch": 0.6021505376344086, "grad_norm": 0.8995200991630554, "learning_rate": 2e-05, "loss": 0.6353, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9632, "tokens_per_second_per_gpu": 11244.72, "total_tokens": 951020416 }, { "epoch": 0.6022130532633159, "grad_norm": 0.9085005521774292, "learning_rate": 2e-05, "loss": 0.6229, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9633, "tokens_per_second_per_gpu": 10221.84, "total_tokens": 951116447 }, { "epoch": 0.602275568892223, "grad_norm": 0.8907333612442017, "learning_rate": 2e-05, "loss": 0.6664, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9634, "tokens_per_second_per_gpu": 10305.13, "total_tokens": 951213315 }, { "epoch": 0.6023380845211302, "grad_norm": 0.828565239906311, "learning_rate": 2e-05, "loss": 0.5693, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9635, "tokens_per_second_per_gpu": 10505.1, "total_tokens": 951314276 }, { "epoch": 0.6024006001500375, "grad_norm": 0.9016339778900146, "learning_rate": 2e-05, "loss": 0.6207, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9636, "tokens_per_second_per_gpu": 10251.04, "total_tokens": 951410034 }, { "epoch": 0.6024631157789447, "grad_norm": 0.918647050857544, "learning_rate": 2e-05, "loss": 0.6166, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9637, "tokens_per_second_per_gpu": 10278.61, "total_tokens": 951507802 }, { "epoch": 0.602525631407852, "grad_norm": 0.8688751459121704, "learning_rate": 2e-05, "loss": 0.6539, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9638, "tokens_per_second_per_gpu": 10823.86, "total_tokens": 951610204 }, { "epoch": 0.6025881470367592, "grad_norm": 0.8683454394340515, "learning_rate": 2e-05, "loss": 0.6395, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9639, "tokens_per_second_per_gpu": 10582.41, "total_tokens": 951711080 }, { "epoch": 0.6026506626656665, "grad_norm": 0.8972373008728027, "learning_rate": 2e-05, "loss": 0.6142, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9640, "tokens_per_second_per_gpu": 10267.3, "total_tokens": 951805040 }, { "epoch": 0.6027131782945736, "grad_norm": 0.8893028497695923, "learning_rate": 2e-05, "loss": 0.6141, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9641, "tokens_per_second_per_gpu": 9887.94, "total_tokens": 951902049 }, { "epoch": 0.6027756939234808, "grad_norm": 0.9099754691123962, "learning_rate": 2e-05, "loss": 0.6011, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9642, "tokens_per_second_per_gpu": 10120.0, "total_tokens": 951996754 }, { "epoch": 0.6028382095523881, "grad_norm": 0.9150661826133728, "learning_rate": 2e-05, "loss": 0.6322, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9643, "tokens_per_second_per_gpu": 10449.05, "total_tokens": 952095512 }, { "epoch": 0.6029007251812953, "grad_norm": 0.9319400787353516, "learning_rate": 2e-05, "loss": 0.6184, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9644, "tokens_per_second_per_gpu": 10088.8, "total_tokens": 952190684 }, { "epoch": 0.6029632408102026, "grad_norm": 0.9100475311279297, "learning_rate": 2e-05, "loss": 0.6412, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9645, "tokens_per_second_per_gpu": 10736.18, "total_tokens": 952288498 }, { "epoch": 0.6030257564391098, "grad_norm": 0.9082328081130981, "learning_rate": 2e-05, "loss": 0.6293, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9646, "tokens_per_second_per_gpu": 10469.03, "total_tokens": 952385660 }, { "epoch": 0.603088272068017, "grad_norm": 0.9399042725563049, "learning_rate": 2e-05, "loss": 0.644, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9647, "tokens_per_second_per_gpu": 10877.01, "total_tokens": 952484314 }, { "epoch": 0.6031507876969242, "grad_norm": 0.894699215888977, "learning_rate": 2e-05, "loss": 0.6024, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9648, "tokens_per_second_per_gpu": 10311.63, "total_tokens": 952581066 }, { "epoch": 0.6032133033258315, "grad_norm": 0.8760000467300415, "learning_rate": 2e-05, "loss": 0.6019, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9649, "tokens_per_second_per_gpu": 10517.38, "total_tokens": 952678180 }, { "epoch": 0.6032758189547387, "grad_norm": 0.8727879524230957, "learning_rate": 2e-05, "loss": 0.6058, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9650, "tokens_per_second_per_gpu": 10274.2, "total_tokens": 952777355 }, { "epoch": 0.6033383345836459, "grad_norm": 0.8948834538459778, "learning_rate": 2e-05, "loss": 0.6362, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9651, "tokens_per_second_per_gpu": 9934.92, "total_tokens": 952874059 }, { "epoch": 0.6034008502125532, "grad_norm": 0.9185671210289001, "learning_rate": 2e-05, "loss": 0.6259, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9652, "tokens_per_second_per_gpu": 11011.5, "total_tokens": 952977194 }, { "epoch": 0.6034633658414603, "grad_norm": 0.8960217237472534, "learning_rate": 2e-05, "loss": 0.6416, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9653, "tokens_per_second_per_gpu": 11412.37, "total_tokens": 953079232 }, { "epoch": 0.6035258814703676, "grad_norm": 0.8746275901794434, "learning_rate": 2e-05, "loss": 0.5948, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9654, "tokens_per_second_per_gpu": 10520.44, "total_tokens": 953178607 }, { "epoch": 0.6035883970992748, "grad_norm": 0.9548124074935913, "learning_rate": 2e-05, "loss": 0.6724, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9655, "tokens_per_second_per_gpu": 10878.03, "total_tokens": 953277080 }, { "epoch": 0.603650912728182, "grad_norm": 0.8974525332450867, "learning_rate": 2e-05, "loss": 0.6493, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9656, "tokens_per_second_per_gpu": 10631.37, "total_tokens": 953377932 }, { "epoch": 0.6037134283570893, "grad_norm": 0.8835238218307495, "learning_rate": 2e-05, "loss": 0.6096, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9657, "tokens_per_second_per_gpu": 10423.28, "total_tokens": 953474779 }, { "epoch": 0.6037759439859965, "grad_norm": 0.8667277693748474, "learning_rate": 2e-05, "loss": 0.6291, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9658, "tokens_per_second_per_gpu": 10833.44, "total_tokens": 953577103 }, { "epoch": 0.6038384596149037, "grad_norm": 0.8683804273605347, "learning_rate": 2e-05, "loss": 0.6256, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9659, "tokens_per_second_per_gpu": 10126.01, "total_tokens": 953676009 }, { "epoch": 0.6039009752438109, "grad_norm": 0.8931612968444824, "learning_rate": 2e-05, "loss": 0.636, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9660, "tokens_per_second_per_gpu": 10897.6, "total_tokens": 953777492 }, { "epoch": 0.6039634908727182, "grad_norm": 0.8745473027229309, "learning_rate": 2e-05, "loss": 0.6226, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9661, "tokens_per_second_per_gpu": 11065.13, "total_tokens": 953877703 }, { "epoch": 0.6040260065016254, "grad_norm": 0.898446798324585, "learning_rate": 2e-05, "loss": 0.6286, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9662, "tokens_per_second_per_gpu": 10910.86, "total_tokens": 953980098 }, { "epoch": 0.6040885221305327, "grad_norm": 0.8755710124969482, "learning_rate": 2e-05, "loss": 0.6497, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9663, "tokens_per_second_per_gpu": 10686.11, "total_tokens": 954082357 }, { "epoch": 0.6041510377594399, "grad_norm": 0.9224533438682556, "learning_rate": 2e-05, "loss": 0.6692, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9664, "tokens_per_second_per_gpu": 10410.03, "total_tokens": 954184552 }, { "epoch": 0.604213553388347, "grad_norm": 0.87521892786026, "learning_rate": 2e-05, "loss": 0.6002, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9665, "tokens_per_second_per_gpu": 11276.5, "total_tokens": 954285001 }, { "epoch": 0.6042760690172543, "grad_norm": 0.8765724301338196, "learning_rate": 2e-05, "loss": 0.6525, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9666, "tokens_per_second_per_gpu": 10296.31, "total_tokens": 954384821 }, { "epoch": 0.6043385846461615, "grad_norm": 0.8639251589775085, "learning_rate": 2e-05, "loss": 0.606, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9667, "tokens_per_second_per_gpu": 10642.87, "total_tokens": 954485739 }, { "epoch": 0.6044011002750688, "grad_norm": 0.8996058702468872, "learning_rate": 2e-05, "loss": 0.5888, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9668, "tokens_per_second_per_gpu": 10407.55, "total_tokens": 954578843 }, { "epoch": 0.604463615903976, "grad_norm": 0.9075015187263489, "learning_rate": 2e-05, "loss": 0.6215, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9669, "tokens_per_second_per_gpu": 9539.85, "total_tokens": 954675198 }, { "epoch": 0.6045261315328833, "grad_norm": 0.9048132300376892, "learning_rate": 2e-05, "loss": 0.6021, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9670, "tokens_per_second_per_gpu": 9573.85, "total_tokens": 954767757 }, { "epoch": 0.6045886471617904, "grad_norm": 0.8775617480278015, "learning_rate": 2e-05, "loss": 0.6335, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9671, "tokens_per_second_per_gpu": 10245.82, "total_tokens": 954865578 }, { "epoch": 0.6046511627906976, "grad_norm": 0.9169535636901855, "learning_rate": 2e-05, "loss": 0.5894, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9672, "tokens_per_second_per_gpu": 10668.29, "total_tokens": 954961707 }, { "epoch": 0.6047136784196049, "grad_norm": 0.9090615510940552, "learning_rate": 2e-05, "loss": 0.6072, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9673, "tokens_per_second_per_gpu": 9599.62, "total_tokens": 955055714 }, { "epoch": 0.6047761940485121, "grad_norm": 0.8944203853607178, "learning_rate": 2e-05, "loss": 0.6342, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9674, "tokens_per_second_per_gpu": 10079.92, "total_tokens": 955153842 }, { "epoch": 0.6048387096774194, "grad_norm": 0.8903552293777466, "learning_rate": 2e-05, "loss": 0.6173, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9675, "tokens_per_second_per_gpu": 10362.44, "total_tokens": 955250362 }, { "epoch": 0.6049012253063266, "grad_norm": 0.8533310890197754, "learning_rate": 2e-05, "loss": 0.6407, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9676, "tokens_per_second_per_gpu": 11489.06, "total_tokens": 955354709 }, { "epoch": 0.6049637409352339, "grad_norm": 0.8519286513328552, "learning_rate": 2e-05, "loss": 0.6299, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9677, "tokens_per_second_per_gpu": 11175.71, "total_tokens": 955453383 }, { "epoch": 0.605026256564141, "grad_norm": 0.9418904781341553, "learning_rate": 2e-05, "loss": 0.6369, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9678, "tokens_per_second_per_gpu": 11070.28, "total_tokens": 955551080 }, { "epoch": 0.6050887721930482, "grad_norm": 0.930980920791626, "learning_rate": 2e-05, "loss": 0.6589, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9679, "tokens_per_second_per_gpu": 10222.74, "total_tokens": 955649358 }, { "epoch": 0.6051512878219555, "grad_norm": 0.8957018852233887, "learning_rate": 2e-05, "loss": 0.6732, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9680, "tokens_per_second_per_gpu": 11261.82, "total_tokens": 955751606 }, { "epoch": 0.6052138034508627, "grad_norm": 0.9469910264015198, "learning_rate": 2e-05, "loss": 0.6941, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9681, "tokens_per_second_per_gpu": 10601.47, "total_tokens": 955851431 }, { "epoch": 0.60527631907977, "grad_norm": 0.8872244358062744, "learning_rate": 2e-05, "loss": 0.6397, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9682, "tokens_per_second_per_gpu": 10251.04, "total_tokens": 955947481 }, { "epoch": 0.6053388347086772, "grad_norm": 0.8878669738769531, "learning_rate": 2e-05, "loss": 0.6305, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9683, "tokens_per_second_per_gpu": 10139.22, "total_tokens": 956043581 }, { "epoch": 0.6054013503375844, "grad_norm": 0.8625959753990173, "learning_rate": 2e-05, "loss": 0.6558, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9684, "tokens_per_second_per_gpu": 11050.81, "total_tokens": 956144952 }, { "epoch": 0.6054638659664916, "grad_norm": 0.885446310043335, "learning_rate": 2e-05, "loss": 0.6388, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9685, "tokens_per_second_per_gpu": 10425.99, "total_tokens": 956244679 }, { "epoch": 0.6055263815953988, "grad_norm": 0.8800164461135864, "learning_rate": 2e-05, "loss": 0.6037, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9686, "tokens_per_second_per_gpu": 11111.14, "total_tokens": 956347697 }, { "epoch": 0.6055888972243061, "grad_norm": 0.8950806260108948, "learning_rate": 2e-05, "loss": 0.633, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9687, "tokens_per_second_per_gpu": 10873.37, "total_tokens": 956446064 }, { "epoch": 0.6056514128532133, "grad_norm": 0.9112827181816101, "learning_rate": 2e-05, "loss": 0.6394, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9688, "tokens_per_second_per_gpu": 9748.37, "total_tokens": 956542220 }, { "epoch": 0.6057139284821206, "grad_norm": 0.8992576003074646, "learning_rate": 2e-05, "loss": 0.6165, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9689, "tokens_per_second_per_gpu": 10746.33, "total_tokens": 956639289 }, { "epoch": 0.6057764441110277, "grad_norm": 0.8767381310462952, "learning_rate": 2e-05, "loss": 0.6604, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9690, "tokens_per_second_per_gpu": 11597.88, "total_tokens": 956745347 }, { "epoch": 0.605838959739935, "grad_norm": 0.9168050289154053, "learning_rate": 2e-05, "loss": 0.6299, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9691, "tokens_per_second_per_gpu": 10616.84, "total_tokens": 956846989 }, { "epoch": 0.6059014753688422, "grad_norm": 0.9099111557006836, "learning_rate": 2e-05, "loss": 0.6634, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9692, "tokens_per_second_per_gpu": 10598.87, "total_tokens": 956950039 }, { "epoch": 0.6059639909977494, "grad_norm": 0.9140846133232117, "learning_rate": 2e-05, "loss": 0.6443, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9693, "tokens_per_second_per_gpu": 10414.09, "total_tokens": 957046886 }, { "epoch": 0.6060265066266567, "grad_norm": 0.8933992385864258, "learning_rate": 2e-05, "loss": 0.6539, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9694, "tokens_per_second_per_gpu": 11372.72, "total_tokens": 957150692 }, { "epoch": 0.6060890222555639, "grad_norm": 0.961822509765625, "learning_rate": 2e-05, "loss": 0.676, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9695, "tokens_per_second_per_gpu": 10612.96, "total_tokens": 957249542 }, { "epoch": 0.6061515378844711, "grad_norm": 0.8624528050422668, "learning_rate": 2e-05, "loss": 0.6243, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9696, "tokens_per_second_per_gpu": 11038.83, "total_tokens": 957351248 }, { "epoch": 0.6062140535133783, "grad_norm": 0.9236396551132202, "learning_rate": 2e-05, "loss": 0.6295, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9697, "tokens_per_second_per_gpu": 10296.94, "total_tokens": 957448654 }, { "epoch": 0.6062765691422856, "grad_norm": 0.8850570917129517, "learning_rate": 2e-05, "loss": 0.6316, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9698, "tokens_per_second_per_gpu": 10666.69, "total_tokens": 957548309 }, { "epoch": 0.6063390847711928, "grad_norm": 0.8934128880500793, "learning_rate": 2e-05, "loss": 0.6334, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9699, "tokens_per_second_per_gpu": 10691.48, "total_tokens": 957649887 }, { "epoch": 0.6064016004001, "grad_norm": 0.9104008674621582, "learning_rate": 2e-05, "loss": 0.5974, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9700, "tokens_per_second_per_gpu": 9360.28, "total_tokens": 957743020 }, { "epoch": 0.6064641160290073, "grad_norm": 0.8804425001144409, "learning_rate": 2e-05, "loss": 0.651, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9701, "tokens_per_second_per_gpu": 9864.69, "total_tokens": 957841538 }, { "epoch": 0.6065266316579144, "grad_norm": 0.9498323202133179, "learning_rate": 2e-05, "loss": 0.6219, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9702, "tokens_per_second_per_gpu": 10693.26, "total_tokens": 957938410 }, { "epoch": 0.6065891472868217, "grad_norm": 0.8737659454345703, "learning_rate": 2e-05, "loss": 0.612, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9703, "tokens_per_second_per_gpu": 10863.35, "total_tokens": 958038742 }, { "epoch": 0.6066516629157289, "grad_norm": 0.9516075253486633, "learning_rate": 2e-05, "loss": 0.6617, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9704, "tokens_per_second_per_gpu": 10705.8, "total_tokens": 958136247 }, { "epoch": 0.6067141785446362, "grad_norm": 0.9030907154083252, "learning_rate": 2e-05, "loss": 0.5893, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9705, "tokens_per_second_per_gpu": 9940.77, "total_tokens": 958233565 }, { "epoch": 0.6067766941735434, "grad_norm": 0.8807452321052551, "learning_rate": 2e-05, "loss": 0.6174, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9706, "tokens_per_second_per_gpu": 11119.68, "total_tokens": 958332158 }, { "epoch": 0.6068392098024507, "grad_norm": 0.9067118763923645, "learning_rate": 2e-05, "loss": 0.6641, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9707, "tokens_per_second_per_gpu": 10084.88, "total_tokens": 958430865 }, { "epoch": 0.6069017254313578, "grad_norm": 0.9290612936019897, "learning_rate": 2e-05, "loss": 0.6251, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9708, "tokens_per_second_per_gpu": 11347.43, "total_tokens": 958529610 }, { "epoch": 0.606964241060265, "grad_norm": 0.9426256418228149, "learning_rate": 2e-05, "loss": 0.6164, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9709, "tokens_per_second_per_gpu": 11726.86, "total_tokens": 958633437 }, { "epoch": 0.6070267566891723, "grad_norm": 0.8908039927482605, "learning_rate": 2e-05, "loss": 0.6253, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9710, "tokens_per_second_per_gpu": 10754.84, "total_tokens": 958730553 }, { "epoch": 0.6070892723180795, "grad_norm": 0.8865000605583191, "learning_rate": 2e-05, "loss": 0.6812, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9711, "tokens_per_second_per_gpu": 10806.75, "total_tokens": 958831951 }, { "epoch": 0.6071517879469868, "grad_norm": 0.8886339664459229, "learning_rate": 2e-05, "loss": 0.617, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9712, "tokens_per_second_per_gpu": 10235.86, "total_tokens": 958928931 }, { "epoch": 0.607214303575894, "grad_norm": 0.8731067180633545, "learning_rate": 2e-05, "loss": 0.6269, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9713, "tokens_per_second_per_gpu": 11156.93, "total_tokens": 959030960 }, { "epoch": 0.6072768192048013, "grad_norm": 0.9093220233917236, "learning_rate": 2e-05, "loss": 0.6024, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9714, "tokens_per_second_per_gpu": 10716.81, "total_tokens": 959133398 }, { "epoch": 0.6073393348337084, "grad_norm": 0.9387540221214294, "learning_rate": 2e-05, "loss": 0.6707, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9715, "tokens_per_second_per_gpu": 10157.29, "total_tokens": 959230049 }, { "epoch": 0.6074018504626156, "grad_norm": 0.8592262864112854, "learning_rate": 2e-05, "loss": 0.6197, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9716, "tokens_per_second_per_gpu": 10674.87, "total_tokens": 959329265 }, { "epoch": 0.6074643660915229, "grad_norm": 0.8909226059913635, "learning_rate": 2e-05, "loss": 0.634, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9717, "tokens_per_second_per_gpu": 10282.81, "total_tokens": 959430563 }, { "epoch": 0.6075268817204301, "grad_norm": 0.9107819199562073, "learning_rate": 2e-05, "loss": 0.6675, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9718, "tokens_per_second_per_gpu": 10536.2, "total_tokens": 959525951 }, { "epoch": 0.6075893973493374, "grad_norm": 0.887172281742096, "learning_rate": 2e-05, "loss": 0.6, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9719, "tokens_per_second_per_gpu": 10115.94, "total_tokens": 959620575 }, { "epoch": 0.6076519129782446, "grad_norm": 0.8887537121772766, "learning_rate": 2e-05, "loss": 0.6461, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9720, "tokens_per_second_per_gpu": 10565.03, "total_tokens": 959718591 }, { "epoch": 0.6077144286071517, "grad_norm": 0.9208866357803345, "learning_rate": 2e-05, "loss": 0.6404, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9721, "tokens_per_second_per_gpu": 10530.81, "total_tokens": 959815425 }, { "epoch": 0.607776944236059, "grad_norm": 0.8918617963790894, "learning_rate": 2e-05, "loss": 0.6261, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9722, "tokens_per_second_per_gpu": 10422.01, "total_tokens": 959912989 }, { "epoch": 0.6078394598649662, "grad_norm": 0.9222100973129272, "learning_rate": 2e-05, "loss": 0.6049, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9723, "tokens_per_second_per_gpu": 9893.33, "total_tokens": 960008143 }, { "epoch": 0.6079019754938735, "grad_norm": 0.8760676980018616, "learning_rate": 2e-05, "loss": 0.6397, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9724, "tokens_per_second_per_gpu": 10929.34, "total_tokens": 960106698 }, { "epoch": 0.6079644911227807, "grad_norm": 0.8988955616950989, "learning_rate": 2e-05, "loss": 0.6407, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9725, "tokens_per_second_per_gpu": 10207.67, "total_tokens": 960206669 }, { "epoch": 0.608027006751688, "grad_norm": 0.9038499593734741, "learning_rate": 2e-05, "loss": 0.612, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9726, "tokens_per_second_per_gpu": 9170.69, "total_tokens": 960302065 }, { "epoch": 0.6080895223805951, "grad_norm": 0.9335921406745911, "learning_rate": 2e-05, "loss": 0.6185, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9727, "tokens_per_second_per_gpu": 9857.92, "total_tokens": 960397740 }, { "epoch": 0.6081520380095023, "grad_norm": 0.9276275038719177, "learning_rate": 2e-05, "loss": 0.64, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9728, "tokens_per_second_per_gpu": 10920.64, "total_tokens": 960496371 }, { "epoch": 0.6082145536384096, "grad_norm": 0.9015493392944336, "learning_rate": 2e-05, "loss": 0.6123, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9729, "tokens_per_second_per_gpu": 10831.05, "total_tokens": 960594215 }, { "epoch": 0.6082770692673168, "grad_norm": 0.870427131652832, "learning_rate": 2e-05, "loss": 0.6163, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9730, "tokens_per_second_per_gpu": 10436.96, "total_tokens": 960695847 }, { "epoch": 0.6083395848962241, "grad_norm": 0.878145694732666, "learning_rate": 2e-05, "loss": 0.5983, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9731, "tokens_per_second_per_gpu": 10432.88, "total_tokens": 960795909 }, { "epoch": 0.6084021005251313, "grad_norm": 0.9297255873680115, "learning_rate": 2e-05, "loss": 0.6223, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9732, "tokens_per_second_per_gpu": 10132.1, "total_tokens": 960893910 }, { "epoch": 0.6084646161540385, "grad_norm": 0.8754585385322571, "learning_rate": 2e-05, "loss": 0.5837, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9733, "tokens_per_second_per_gpu": 9743.1, "total_tokens": 960988241 }, { "epoch": 0.6085271317829457, "grad_norm": 0.8924839496612549, "learning_rate": 2e-05, "loss": 0.6203, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9734, "tokens_per_second_per_gpu": 11070.12, "total_tokens": 961090212 }, { "epoch": 0.608589647411853, "grad_norm": 0.9419102072715759, "learning_rate": 2e-05, "loss": 0.6062, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9735, "tokens_per_second_per_gpu": 9464.73, "total_tokens": 961186911 }, { "epoch": 0.6086521630407602, "grad_norm": 0.8794421553611755, "learning_rate": 2e-05, "loss": 0.6107, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9736, "tokens_per_second_per_gpu": 9628.23, "total_tokens": 961286375 }, { "epoch": 0.6087146786696674, "grad_norm": 0.9361518025398254, "learning_rate": 2e-05, "loss": 0.6393, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9737, "tokens_per_second_per_gpu": 10896.53, "total_tokens": 961386021 }, { "epoch": 0.6087771942985747, "grad_norm": 0.8835973143577576, "learning_rate": 2e-05, "loss": 0.6318, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9738, "tokens_per_second_per_gpu": 10500.94, "total_tokens": 961485025 }, { "epoch": 0.6088397099274818, "grad_norm": 0.8938974738121033, "learning_rate": 2e-05, "loss": 0.6144, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9739, "tokens_per_second_per_gpu": 9727.17, "total_tokens": 961581740 }, { "epoch": 0.6089022255563891, "grad_norm": 0.9031102061271667, "learning_rate": 2e-05, "loss": 0.66, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9740, "tokens_per_second_per_gpu": 10257.65, "total_tokens": 961682181 }, { "epoch": 0.6089647411852963, "grad_norm": 0.8539242148399353, "learning_rate": 2e-05, "loss": 0.6188, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9741, "tokens_per_second_per_gpu": 10911.32, "total_tokens": 961784098 }, { "epoch": 0.6090272568142036, "grad_norm": 0.9047542810440063, "learning_rate": 2e-05, "loss": 0.5933, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9742, "tokens_per_second_per_gpu": 9463.71, "total_tokens": 961880034 }, { "epoch": 0.6090897724431108, "grad_norm": 0.9052282571792603, "learning_rate": 2e-05, "loss": 0.6353, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9743, "tokens_per_second_per_gpu": 10574.94, "total_tokens": 961978977 }, { "epoch": 0.609152288072018, "grad_norm": 0.8903628587722778, "learning_rate": 2e-05, "loss": 0.6051, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9744, "tokens_per_second_per_gpu": 10714.86, "total_tokens": 962077960 }, { "epoch": 0.6092148037009252, "grad_norm": 0.893916130065918, "learning_rate": 2e-05, "loss": 0.6439, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9745, "tokens_per_second_per_gpu": 11235.3, "total_tokens": 962178765 }, { "epoch": 0.6092773193298324, "grad_norm": 0.8814464807510376, "learning_rate": 2e-05, "loss": 0.6281, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9746, "tokens_per_second_per_gpu": 14501.16, "total_tokens": 962279440 }, { "epoch": 0.6093398349587397, "grad_norm": 0.9306403994560242, "learning_rate": 2e-05, "loss": 0.6378, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9747, "tokens_per_second_per_gpu": 9973.42, "total_tokens": 962373813 }, { "epoch": 0.6094023505876469, "grad_norm": 0.8908021450042725, "learning_rate": 2e-05, "loss": 0.6458, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9748, "tokens_per_second_per_gpu": 11377.39, "total_tokens": 962476281 }, { "epoch": 0.6094648662165542, "grad_norm": 0.9892479181289673, "learning_rate": 2e-05, "loss": 0.6452, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9749, "tokens_per_second_per_gpu": 10608.17, "total_tokens": 962574849 }, { "epoch": 0.6095273818454614, "grad_norm": 0.8922898173332214, "learning_rate": 2e-05, "loss": 0.641, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9750, "tokens_per_second_per_gpu": 10718.94, "total_tokens": 962676325 }, { "epoch": 0.6095898974743686, "grad_norm": 0.8422688245773315, "learning_rate": 2e-05, "loss": 0.6247, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9751, "tokens_per_second_per_gpu": 11014.82, "total_tokens": 962775317 }, { "epoch": 0.6096524131032758, "grad_norm": 0.9086449146270752, "learning_rate": 2e-05, "loss": 0.611, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9752, "tokens_per_second_per_gpu": 10700.84, "total_tokens": 962875349 }, { "epoch": 0.609714928732183, "grad_norm": 0.8815387487411499, "learning_rate": 2e-05, "loss": 0.594, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9753, "tokens_per_second_per_gpu": 10616.02, "total_tokens": 962975902 }, { "epoch": 0.6097774443610903, "grad_norm": 0.8817440271377563, "learning_rate": 2e-05, "loss": 0.6351, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9754, "tokens_per_second_per_gpu": 10927.65, "total_tokens": 963075701 }, { "epoch": 0.6098399599899975, "grad_norm": 0.9493213295936584, "learning_rate": 2e-05, "loss": 0.6529, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9755, "tokens_per_second_per_gpu": 10371.96, "total_tokens": 963170560 }, { "epoch": 0.6099024756189048, "grad_norm": 0.8896973729133606, "learning_rate": 2e-05, "loss": 0.618, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9756, "tokens_per_second_per_gpu": 10054.47, "total_tokens": 963268212 }, { "epoch": 0.609964991247812, "grad_norm": 0.9222592115402222, "learning_rate": 2e-05, "loss": 0.648, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9757, "tokens_per_second_per_gpu": 9914.31, "total_tokens": 963365033 }, { "epoch": 0.6100275068767191, "grad_norm": 0.8924468755722046, "learning_rate": 2e-05, "loss": 0.654, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9758, "tokens_per_second_per_gpu": 11470.48, "total_tokens": 963467863 }, { "epoch": 0.6100900225056264, "grad_norm": 0.8955541849136353, "learning_rate": 2e-05, "loss": 0.6009, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9759, "tokens_per_second_per_gpu": 10287.82, "total_tokens": 963566240 }, { "epoch": 0.6101525381345336, "grad_norm": 0.9150742292404175, "learning_rate": 2e-05, "loss": 0.6234, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9760, "tokens_per_second_per_gpu": 10805.55, "total_tokens": 963664188 }, { "epoch": 0.6102150537634409, "grad_norm": 0.8498182892799377, "learning_rate": 2e-05, "loss": 0.6076, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9761, "tokens_per_second_per_gpu": 10525.15, "total_tokens": 963764572 }, { "epoch": 0.6102775693923481, "grad_norm": 0.8942632675170898, "learning_rate": 2e-05, "loss": 0.6357, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9762, "tokens_per_second_per_gpu": 10639.64, "total_tokens": 963866368 }, { "epoch": 0.6103400850212554, "grad_norm": 0.9000500440597534, "learning_rate": 2e-05, "loss": 0.6409, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9763, "tokens_per_second_per_gpu": 9234.29, "total_tokens": 963961677 }, { "epoch": 0.6104026006501625, "grad_norm": 0.8606049418449402, "learning_rate": 2e-05, "loss": 0.6451, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9764, "tokens_per_second_per_gpu": 10783.58, "total_tokens": 964064505 }, { "epoch": 0.6104651162790697, "grad_norm": 0.8838011622428894, "learning_rate": 2e-05, "loss": 0.6333, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9765, "tokens_per_second_per_gpu": 10963.39, "total_tokens": 964164847 }, { "epoch": 0.610527631907977, "grad_norm": 0.846813976764679, "learning_rate": 2e-05, "loss": 0.6465, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9766, "tokens_per_second_per_gpu": 11237.0, "total_tokens": 964268951 }, { "epoch": 0.6105901475368842, "grad_norm": 0.9655773639678955, "learning_rate": 2e-05, "loss": 0.5933, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9767, "tokens_per_second_per_gpu": 10792.24, "total_tokens": 964362617 }, { "epoch": 0.6106526631657915, "grad_norm": 0.8577547669410706, "learning_rate": 2e-05, "loss": 0.6135, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9768, "tokens_per_second_per_gpu": 10805.65, "total_tokens": 964463893 }, { "epoch": 0.6107151787946987, "grad_norm": 0.8985162377357483, "learning_rate": 2e-05, "loss": 0.6172, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9769, "tokens_per_second_per_gpu": 10715.06, "total_tokens": 964560676 }, { "epoch": 0.6107776944236059, "grad_norm": 0.8835681676864624, "learning_rate": 2e-05, "loss": 0.6601, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9770, "tokens_per_second_per_gpu": 11348.42, "total_tokens": 964662443 }, { "epoch": 0.6108402100525131, "grad_norm": 0.873407781124115, "learning_rate": 2e-05, "loss": 0.662, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9771, "tokens_per_second_per_gpu": 10437.42, "total_tokens": 964761753 }, { "epoch": 0.6109027256814203, "grad_norm": 0.8515611886978149, "learning_rate": 2e-05, "loss": 0.6152, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9772, "tokens_per_second_per_gpu": 10380.57, "total_tokens": 964863322 }, { "epoch": 0.6109652413103276, "grad_norm": 0.8960371017456055, "learning_rate": 2e-05, "loss": 0.6022, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9773, "tokens_per_second_per_gpu": 10119.54, "total_tokens": 964960628 }, { "epoch": 0.6110277569392348, "grad_norm": 1.1771085262298584, "learning_rate": 2e-05, "loss": 0.629, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9774, "tokens_per_second_per_gpu": 10071.35, "total_tokens": 965056556 }, { "epoch": 0.6110902725681421, "grad_norm": 0.8491919040679932, "learning_rate": 2e-05, "loss": 0.603, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9775, "tokens_per_second_per_gpu": 10936.67, "total_tokens": 965160258 }, { "epoch": 0.6111527881970492, "grad_norm": 0.9482218027114868, "learning_rate": 2e-05, "loss": 0.6502, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9776, "tokens_per_second_per_gpu": 10733.81, "total_tokens": 965257987 }, { "epoch": 0.6112153038259565, "grad_norm": 0.9462401270866394, "learning_rate": 2e-05, "loss": 0.6501, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9777, "tokens_per_second_per_gpu": 11190.58, "total_tokens": 965359144 }, { "epoch": 0.6112778194548637, "grad_norm": 0.911906898021698, "learning_rate": 2e-05, "loss": 0.5904, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9778, "tokens_per_second_per_gpu": 10967.79, "total_tokens": 965457206 }, { "epoch": 0.611340335083771, "grad_norm": 0.9119093418121338, "learning_rate": 2e-05, "loss": 0.6712, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9779, "tokens_per_second_per_gpu": 10401.72, "total_tokens": 965558849 }, { "epoch": 0.6114028507126782, "grad_norm": 0.8921203017234802, "learning_rate": 2e-05, "loss": 0.6143, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9780, "tokens_per_second_per_gpu": 10712.07, "total_tokens": 965658027 }, { "epoch": 0.6114653663415854, "grad_norm": 0.8801081776618958, "learning_rate": 2e-05, "loss": 0.6175, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9781, "tokens_per_second_per_gpu": 10384.58, "total_tokens": 965756599 }, { "epoch": 0.6115278819704926, "grad_norm": 0.8932783603668213, "learning_rate": 2e-05, "loss": 0.6488, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9782, "tokens_per_second_per_gpu": 10395.04, "total_tokens": 965854623 }, { "epoch": 0.6115903975993998, "grad_norm": 0.8781305551528931, "learning_rate": 2e-05, "loss": 0.6295, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9783, "tokens_per_second_per_gpu": 10692.05, "total_tokens": 965954674 }, { "epoch": 0.6116529132283071, "grad_norm": 0.885161817073822, "learning_rate": 2e-05, "loss": 0.5908, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9784, "tokens_per_second_per_gpu": 11166.39, "total_tokens": 966052817 }, { "epoch": 0.6117154288572143, "grad_norm": 0.8390251398086548, "learning_rate": 2e-05, "loss": 0.6815, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9785, "tokens_per_second_per_gpu": 11604.04, "total_tokens": 966160124 }, { "epoch": 0.6117779444861215, "grad_norm": 0.8649470806121826, "learning_rate": 2e-05, "loss": 0.6204, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9786, "tokens_per_second_per_gpu": 10773.81, "total_tokens": 966258395 }, { "epoch": 0.6118404601150288, "grad_norm": 0.8554702997207642, "learning_rate": 2e-05, "loss": 0.6523, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9787, "tokens_per_second_per_gpu": 10839.19, "total_tokens": 966360646 }, { "epoch": 0.6119029757439359, "grad_norm": 1.132239818572998, "learning_rate": 2e-05, "loss": 0.6706, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9788, "tokens_per_second_per_gpu": 10557.17, "total_tokens": 966459253 }, { "epoch": 0.6119654913728432, "grad_norm": 0.8826273083686829, "learning_rate": 2e-05, "loss": 0.6427, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9789, "tokens_per_second_per_gpu": 10622.64, "total_tokens": 966558691 }, { "epoch": 0.6120280070017504, "grad_norm": 0.8648585081100464, "learning_rate": 2e-05, "loss": 0.6727, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9790, "tokens_per_second_per_gpu": 11104.37, "total_tokens": 966661835 }, { "epoch": 0.6120905226306577, "grad_norm": 0.9149312376976013, "learning_rate": 2e-05, "loss": 0.6437, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9791, "tokens_per_second_per_gpu": 10467.13, "total_tokens": 966759877 }, { "epoch": 0.6121530382595649, "grad_norm": 0.9088026881217957, "learning_rate": 2e-05, "loss": 0.6128, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9792, "tokens_per_second_per_gpu": 10275.45, "total_tokens": 966857468 }, { "epoch": 0.6122155538884722, "grad_norm": 0.9001367688179016, "learning_rate": 2e-05, "loss": 0.6264, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9793, "tokens_per_second_per_gpu": 10520.76, "total_tokens": 966954151 }, { "epoch": 0.6122780695173794, "grad_norm": 0.9186236262321472, "learning_rate": 2e-05, "loss": 0.5997, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9794, "tokens_per_second_per_gpu": 9877.93, "total_tokens": 967047769 }, { "epoch": 0.6123405851462865, "grad_norm": 0.841953456401825, "learning_rate": 2e-05, "loss": 0.6232, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9795, "tokens_per_second_per_gpu": 10701.95, "total_tokens": 967150044 }, { "epoch": 0.6124031007751938, "grad_norm": 0.8759480714797974, "learning_rate": 2e-05, "loss": 0.6325, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9796, "tokens_per_second_per_gpu": 10976.2, "total_tokens": 967252366 }, { "epoch": 0.612465616404101, "grad_norm": 0.925814151763916, "learning_rate": 2e-05, "loss": 0.6472, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9797, "tokens_per_second_per_gpu": 10245.62, "total_tokens": 967350621 }, { "epoch": 0.6125281320330083, "grad_norm": 0.9137313961982727, "learning_rate": 2e-05, "loss": 0.6297, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9798, "tokens_per_second_per_gpu": 10125.87, "total_tokens": 967450888 }, { "epoch": 0.6125906476619155, "grad_norm": 0.8647183775901794, "learning_rate": 2e-05, "loss": 0.6194, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9799, "tokens_per_second_per_gpu": 10718.68, "total_tokens": 967552114 }, { "epoch": 0.6126531632908228, "grad_norm": 0.8586437106132507, "learning_rate": 2e-05, "loss": 0.601, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9800, "tokens_per_second_per_gpu": 11111.97, "total_tokens": 967648976 }, { "epoch": 0.6127156789197299, "grad_norm": 0.8673313856124878, "learning_rate": 2e-05, "loss": 0.597, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9801, "tokens_per_second_per_gpu": 11276.5, "total_tokens": 967747615 }, { "epoch": 0.6127781945486371, "grad_norm": 0.8538737297058105, "learning_rate": 2e-05, "loss": 0.6134, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9802, "tokens_per_second_per_gpu": 10600.25, "total_tokens": 967847314 }, { "epoch": 0.6128407101775444, "grad_norm": 0.8564271330833435, "learning_rate": 2e-05, "loss": 0.6427, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9803, "tokens_per_second_per_gpu": 11526.31, "total_tokens": 967951761 }, { "epoch": 0.6129032258064516, "grad_norm": 0.8568373322486877, "learning_rate": 2e-05, "loss": 0.6148, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9804, "tokens_per_second_per_gpu": 10446.41, "total_tokens": 968053332 }, { "epoch": 0.6129657414353589, "grad_norm": 0.9076757431030273, "learning_rate": 2e-05, "loss": 0.631, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9805, "tokens_per_second_per_gpu": 10856.79, "total_tokens": 968151912 }, { "epoch": 0.6130282570642661, "grad_norm": 0.8855800628662109, "learning_rate": 2e-05, "loss": 0.621, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9806, "tokens_per_second_per_gpu": 10844.25, "total_tokens": 968254880 }, { "epoch": 0.6130907726931732, "grad_norm": 0.8892872929573059, "learning_rate": 2e-05, "loss": 0.6114, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9807, "tokens_per_second_per_gpu": 10006.22, "total_tokens": 968352595 }, { "epoch": 0.6131532883220805, "grad_norm": 0.8894641399383545, "learning_rate": 2e-05, "loss": 0.6241, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9808, "tokens_per_second_per_gpu": 10353.45, "total_tokens": 968451364 }, { "epoch": 0.6132158039509877, "grad_norm": 0.8977994322776794, "learning_rate": 2e-05, "loss": 0.5791, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9809, "tokens_per_second_per_gpu": 9923.06, "total_tokens": 968542347 }, { "epoch": 0.613278319579895, "grad_norm": 0.8679284453392029, "learning_rate": 2e-05, "loss": 0.5918, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9810, "tokens_per_second_per_gpu": 10950.15, "total_tokens": 968636145 }, { "epoch": 0.6133408352088022, "grad_norm": 0.839898407459259, "learning_rate": 2e-05, "loss": 0.6158, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9811, "tokens_per_second_per_gpu": 10396.67, "total_tokens": 968737948 }, { "epoch": 0.6134033508377095, "grad_norm": 0.874167263507843, "learning_rate": 2e-05, "loss": 0.6028, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9812, "tokens_per_second_per_gpu": 10663.5, "total_tokens": 968835759 }, { "epoch": 0.6134658664666166, "grad_norm": 0.8601965308189392, "learning_rate": 2e-05, "loss": 0.5964, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9813, "tokens_per_second_per_gpu": 10436.99, "total_tokens": 968933793 }, { "epoch": 0.6135283820955238, "grad_norm": 0.8959999084472656, "learning_rate": 2e-05, "loss": 0.6812, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9814, "tokens_per_second_per_gpu": 9893.99, "total_tokens": 969031201 }, { "epoch": 0.6135908977244311, "grad_norm": 0.8782525658607483, "learning_rate": 2e-05, "loss": 0.6538, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9815, "tokens_per_second_per_gpu": 10987.41, "total_tokens": 969132985 }, { "epoch": 0.6136534133533383, "grad_norm": 0.8652355670928955, "learning_rate": 2e-05, "loss": 0.5997, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9816, "tokens_per_second_per_gpu": 10473.51, "total_tokens": 969230719 }, { "epoch": 0.6137159289822456, "grad_norm": 0.8351831436157227, "learning_rate": 2e-05, "loss": 0.579, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9817, "tokens_per_second_per_gpu": 10182.99, "total_tokens": 969328302 }, { "epoch": 0.6137784446111528, "grad_norm": 0.9388365149497986, "learning_rate": 2e-05, "loss": 0.5874, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9818, "tokens_per_second_per_gpu": 9611.6, "total_tokens": 969420153 }, { "epoch": 0.61384096024006, "grad_norm": 0.9295251965522766, "learning_rate": 2e-05, "loss": 0.6566, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9819, "tokens_per_second_per_gpu": 11036.99, "total_tokens": 969523681 }, { "epoch": 0.6139034758689672, "grad_norm": 0.8866828083992004, "learning_rate": 2e-05, "loss": 0.6237, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9820, "tokens_per_second_per_gpu": 10359.25, "total_tokens": 969623774 }, { "epoch": 0.6139659914978745, "grad_norm": 0.9101597666740417, "learning_rate": 2e-05, "loss": 0.6693, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9821, "tokens_per_second_per_gpu": 10053.44, "total_tokens": 969723006 }, { "epoch": 0.6140285071267817, "grad_norm": 0.9024683833122253, "learning_rate": 2e-05, "loss": 0.6162, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9822, "tokens_per_second_per_gpu": 10622.77, "total_tokens": 969820914 }, { "epoch": 0.6140910227556889, "grad_norm": 0.8840463757514954, "learning_rate": 2e-05, "loss": 0.6459, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9823, "tokens_per_second_per_gpu": 11062.02, "total_tokens": 969922590 }, { "epoch": 0.6141535383845962, "grad_norm": 0.9113801717758179, "learning_rate": 2e-05, "loss": 0.6193, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9824, "tokens_per_second_per_gpu": 10587.19, "total_tokens": 970015411 }, { "epoch": 0.6142160540135033, "grad_norm": 0.9072033166885376, "learning_rate": 2e-05, "loss": 0.6586, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9825, "tokens_per_second_per_gpu": 9608.19, "total_tokens": 970116431 }, { "epoch": 0.6142785696424106, "grad_norm": 0.9146367907524109, "learning_rate": 2e-05, "loss": 0.6228, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9826, "tokens_per_second_per_gpu": 9622.3, "total_tokens": 970211003 }, { "epoch": 0.6143410852713178, "grad_norm": 0.849608838558197, "learning_rate": 2e-05, "loss": 0.6213, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9827, "tokens_per_second_per_gpu": 10436.21, "total_tokens": 970314366 }, { "epoch": 0.614403600900225, "grad_norm": 0.8713964223861694, "learning_rate": 2e-05, "loss": 0.5984, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9828, "tokens_per_second_per_gpu": 10161.4, "total_tokens": 970410813 }, { "epoch": 0.6144661165291323, "grad_norm": 0.8960294723510742, "learning_rate": 2e-05, "loss": 0.5977, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9829, "tokens_per_second_per_gpu": 9971.86, "total_tokens": 970506221 }, { "epoch": 0.6145286321580395, "grad_norm": 0.8702906370162964, "learning_rate": 2e-05, "loss": 0.5981, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9830, "tokens_per_second_per_gpu": 10479.17, "total_tokens": 970608166 }, { "epoch": 0.6145911477869468, "grad_norm": 0.8857836127281189, "learning_rate": 2e-05, "loss": 0.6256, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9831, "tokens_per_second_per_gpu": 10792.26, "total_tokens": 970707160 }, { "epoch": 0.6146536634158539, "grad_norm": 0.8450369834899902, "learning_rate": 2e-05, "loss": 0.5739, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9832, "tokens_per_second_per_gpu": 9932.79, "total_tokens": 970804632 }, { "epoch": 0.6147161790447612, "grad_norm": 0.9037669897079468, "learning_rate": 2e-05, "loss": 0.6476, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9833, "tokens_per_second_per_gpu": 10814.22, "total_tokens": 970904810 }, { "epoch": 0.6147786946736684, "grad_norm": 0.8904596567153931, "learning_rate": 2e-05, "loss": 0.6157, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9834, "tokens_per_second_per_gpu": 10926.04, "total_tokens": 971003485 }, { "epoch": 0.6148412103025757, "grad_norm": 0.9056563377380371, "learning_rate": 2e-05, "loss": 0.6152, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9835, "tokens_per_second_per_gpu": 9932.83, "total_tokens": 971095763 }, { "epoch": 0.6149037259314829, "grad_norm": 0.8878365159034729, "learning_rate": 2e-05, "loss": 0.6156, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9836, "tokens_per_second_per_gpu": 9749.27, "total_tokens": 971190697 }, { "epoch": 0.6149662415603901, "grad_norm": 0.9037503600120544, "learning_rate": 2e-05, "loss": 0.6503, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9837, "tokens_per_second_per_gpu": 10564.94, "total_tokens": 971289585 }, { "epoch": 0.6150287571892973, "grad_norm": 0.877985417842865, "learning_rate": 2e-05, "loss": 0.622, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9838, "tokens_per_second_per_gpu": 10690.72, "total_tokens": 971391589 }, { "epoch": 0.6150912728182045, "grad_norm": 0.9245865345001221, "learning_rate": 2e-05, "loss": 0.6498, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9839, "tokens_per_second_per_gpu": 10692.41, "total_tokens": 971484881 }, { "epoch": 0.6151537884471118, "grad_norm": 0.8913196921348572, "learning_rate": 2e-05, "loss": 0.6403, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9840, "tokens_per_second_per_gpu": 10067.47, "total_tokens": 971584113 }, { "epoch": 0.615216304076019, "grad_norm": 0.9348759055137634, "learning_rate": 2e-05, "loss": 0.6483, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9841, "tokens_per_second_per_gpu": 10409.8, "total_tokens": 971682071 }, { "epoch": 0.6152788197049263, "grad_norm": 0.8610477447509766, "learning_rate": 2e-05, "loss": 0.6273, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9842, "tokens_per_second_per_gpu": 10986.62, "total_tokens": 971780421 }, { "epoch": 0.6153413353338335, "grad_norm": 0.8687574863433838, "learning_rate": 2e-05, "loss": 0.6207, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9843, "tokens_per_second_per_gpu": 10880.0, "total_tokens": 971883162 }, { "epoch": 0.6154038509627406, "grad_norm": 0.8685698509216309, "learning_rate": 2e-05, "loss": 0.6042, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9844, "tokens_per_second_per_gpu": 10667.16, "total_tokens": 971982316 }, { "epoch": 0.6154663665916479, "grad_norm": 0.8844584226608276, "learning_rate": 2e-05, "loss": 0.6196, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9845, "tokens_per_second_per_gpu": 10623.19, "total_tokens": 972083619 }, { "epoch": 0.6155288822205551, "grad_norm": 0.903753936290741, "learning_rate": 2e-05, "loss": 0.5977, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9846, "tokens_per_second_per_gpu": 10804.22, "total_tokens": 972179076 }, { "epoch": 0.6155913978494624, "grad_norm": 0.8576764464378357, "learning_rate": 2e-05, "loss": 0.5961, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9847, "tokens_per_second_per_gpu": 10832.75, "total_tokens": 972276703 }, { "epoch": 0.6156539134783696, "grad_norm": 0.8782941699028015, "learning_rate": 2e-05, "loss": 0.6299, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9848, "tokens_per_second_per_gpu": 10719.45, "total_tokens": 972375639 }, { "epoch": 0.6157164291072769, "grad_norm": 0.8584007024765015, "learning_rate": 2e-05, "loss": 0.5713, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9849, "tokens_per_second_per_gpu": 9938.98, "total_tokens": 972472932 }, { "epoch": 0.615778944736184, "grad_norm": 0.9033038020133972, "learning_rate": 2e-05, "loss": 0.6081, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9850, "tokens_per_second_per_gpu": 10433.3, "total_tokens": 972569299 }, { "epoch": 0.6158414603650912, "grad_norm": 0.8748132586479187, "learning_rate": 2e-05, "loss": 0.6108, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9851, "tokens_per_second_per_gpu": 10590.48, "total_tokens": 972668331 }, { "epoch": 0.6159039759939985, "grad_norm": 0.89408278465271, "learning_rate": 2e-05, "loss": 0.654, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9852, "tokens_per_second_per_gpu": 10590.49, "total_tokens": 972769083 }, { "epoch": 0.6159664916229057, "grad_norm": 0.9122580289840698, "learning_rate": 2e-05, "loss": 0.653, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9853, "tokens_per_second_per_gpu": 9961.05, "total_tokens": 972869557 }, { "epoch": 0.616029007251813, "grad_norm": 0.880416750907898, "learning_rate": 2e-05, "loss": 0.5994, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9854, "tokens_per_second_per_gpu": 10600.3, "total_tokens": 972965277 }, { "epoch": 0.6160915228807202, "grad_norm": 0.9179509878158569, "learning_rate": 2e-05, "loss": 0.6019, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9855, "tokens_per_second_per_gpu": 10988.09, "total_tokens": 973060618 }, { "epoch": 0.6161540385096274, "grad_norm": 0.8939794301986694, "learning_rate": 2e-05, "loss": 0.6069, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9856, "tokens_per_second_per_gpu": 10794.45, "total_tokens": 973157414 }, { "epoch": 0.6162165541385346, "grad_norm": 0.9065475463867188, "learning_rate": 2e-05, "loss": 0.645, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9857, "tokens_per_second_per_gpu": 10838.37, "total_tokens": 973252763 }, { "epoch": 0.6162790697674418, "grad_norm": 0.8731662631034851, "learning_rate": 2e-05, "loss": 0.6252, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9858, "tokens_per_second_per_gpu": 10752.87, "total_tokens": 973353317 }, { "epoch": 0.6163415853963491, "grad_norm": 0.8736754059791565, "learning_rate": 2e-05, "loss": 0.6096, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9859, "tokens_per_second_per_gpu": 9127.95, "total_tokens": 973447724 }, { "epoch": 0.6164041010252563, "grad_norm": 0.8845568299293518, "learning_rate": 2e-05, "loss": 0.6185, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9860, "tokens_per_second_per_gpu": 11165.51, "total_tokens": 973548560 }, { "epoch": 0.6164666166541636, "grad_norm": 0.8769469857215881, "learning_rate": 2e-05, "loss": 0.6229, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9861, "tokens_per_second_per_gpu": 11096.0, "total_tokens": 973650161 }, { "epoch": 0.6165291322830707, "grad_norm": 0.8599252700805664, "learning_rate": 2e-05, "loss": 0.6481, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9862, "tokens_per_second_per_gpu": 11155.57, "total_tokens": 973754048 }, { "epoch": 0.616591647911978, "grad_norm": 0.8930362462997437, "learning_rate": 2e-05, "loss": 0.6451, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9863, "tokens_per_second_per_gpu": 10274.29, "total_tokens": 973854304 }, { "epoch": 0.6166541635408852, "grad_norm": 0.8704214096069336, "learning_rate": 2e-05, "loss": 0.5964, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9864, "tokens_per_second_per_gpu": 9319.43, "total_tokens": 973947760 }, { "epoch": 0.6167166791697924, "grad_norm": 0.9266123175621033, "learning_rate": 2e-05, "loss": 0.6349, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9865, "tokens_per_second_per_gpu": 9894.97, "total_tokens": 974042939 }, { "epoch": 0.6167791947986997, "grad_norm": 0.9083724617958069, "learning_rate": 2e-05, "loss": 0.658, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9866, "tokens_per_second_per_gpu": 10966.14, "total_tokens": 974144533 }, { "epoch": 0.6168417104276069, "grad_norm": 0.9951679706573486, "learning_rate": 2e-05, "loss": 0.6393, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9867, "tokens_per_second_per_gpu": 10365.29, "total_tokens": 974243756 }, { "epoch": 0.6169042260565142, "grad_norm": 0.8568891286849976, "learning_rate": 2e-05, "loss": 0.6185, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9868, "tokens_per_second_per_gpu": 10506.57, "total_tokens": 974345824 }, { "epoch": 0.6169667416854213, "grad_norm": 0.8533965349197388, "learning_rate": 2e-05, "loss": 0.6151, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9869, "tokens_per_second_per_gpu": 10809.5, "total_tokens": 974449716 }, { "epoch": 0.6170292573143286, "grad_norm": 0.90513676404953, "learning_rate": 2e-05, "loss": 0.6824, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9870, "tokens_per_second_per_gpu": 11402.4, "total_tokens": 974551390 }, { "epoch": 0.6170917729432358, "grad_norm": 0.8935580849647522, "learning_rate": 2e-05, "loss": 0.6095, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9871, "tokens_per_second_per_gpu": 10299.54, "total_tokens": 974646009 }, { "epoch": 0.617154288572143, "grad_norm": 0.8674104809761047, "learning_rate": 2e-05, "loss": 0.6014, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9872, "tokens_per_second_per_gpu": 10526.85, "total_tokens": 974744640 }, { "epoch": 0.6172168042010503, "grad_norm": 0.9025633335113525, "learning_rate": 2e-05, "loss": 0.6373, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9873, "tokens_per_second_per_gpu": 10400.91, "total_tokens": 974844632 }, { "epoch": 0.6172793198299575, "grad_norm": 0.8810765147209167, "learning_rate": 2e-05, "loss": 0.6223, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9874, "tokens_per_second_per_gpu": 9488.74, "total_tokens": 974940861 }, { "epoch": 0.6173418354588647, "grad_norm": 1.0242221355438232, "learning_rate": 2e-05, "loss": 0.6218, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9875, "tokens_per_second_per_gpu": 9911.4, "total_tokens": 975037899 }, { "epoch": 0.6174043510877719, "grad_norm": 0.8494796752929688, "learning_rate": 2e-05, "loss": 0.6166, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9876, "tokens_per_second_per_gpu": 11356.85, "total_tokens": 975140709 }, { "epoch": 0.6174668667166792, "grad_norm": 0.884960412979126, "learning_rate": 2e-05, "loss": 0.6374, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9877, "tokens_per_second_per_gpu": 10714.14, "total_tokens": 975238668 }, { "epoch": 0.6175293823455864, "grad_norm": 0.944339394569397, "learning_rate": 2e-05, "loss": 0.6484, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9878, "tokens_per_second_per_gpu": 11586.0, "total_tokens": 975340361 }, { "epoch": 0.6175918979744937, "grad_norm": 0.8964020609855652, "learning_rate": 2e-05, "loss": 0.621, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9879, "tokens_per_second_per_gpu": 9881.92, "total_tokens": 975436486 }, { "epoch": 0.6176544136034009, "grad_norm": 0.9095280766487122, "learning_rate": 2e-05, "loss": 0.7221, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9880, "tokens_per_second_per_gpu": 11047.82, "total_tokens": 975538208 }, { "epoch": 0.617716929232308, "grad_norm": 0.8772763013839722, "learning_rate": 2e-05, "loss": 0.6683, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9881, "tokens_per_second_per_gpu": 10736.6, "total_tokens": 975642612 }, { "epoch": 0.6177794448612153, "grad_norm": 0.9136253595352173, "learning_rate": 2e-05, "loss": 0.6246, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9882, "tokens_per_second_per_gpu": 10427.57, "total_tokens": 975740670 }, { "epoch": 0.6178419604901225, "grad_norm": 0.8968483209609985, "learning_rate": 2e-05, "loss": 0.6482, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9883, "tokens_per_second_per_gpu": 11061.04, "total_tokens": 975844170 }, { "epoch": 0.6179044761190298, "grad_norm": 0.8871212601661682, "learning_rate": 2e-05, "loss": 0.651, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9884, "tokens_per_second_per_gpu": 11088.34, "total_tokens": 975947787 }, { "epoch": 0.617966991747937, "grad_norm": 0.8987064361572266, "learning_rate": 2e-05, "loss": 0.5859, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9885, "tokens_per_second_per_gpu": 10734.09, "total_tokens": 976042483 }, { "epoch": 0.6180295073768443, "grad_norm": 0.899416983127594, "learning_rate": 2e-05, "loss": 0.6294, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9886, "tokens_per_second_per_gpu": 10493.55, "total_tokens": 976142336 }, { "epoch": 0.6180920230057514, "grad_norm": 0.9017811417579651, "learning_rate": 2e-05, "loss": 0.6519, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9887, "tokens_per_second_per_gpu": 9951.25, "total_tokens": 976239709 }, { "epoch": 0.6181545386346586, "grad_norm": 0.8835926055908203, "learning_rate": 2e-05, "loss": 0.6472, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9888, "tokens_per_second_per_gpu": 11096.33, "total_tokens": 976337143 }, { "epoch": 0.6182170542635659, "grad_norm": 0.8836472630500793, "learning_rate": 2e-05, "loss": 0.6396, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9889, "tokens_per_second_per_gpu": 11144.59, "total_tokens": 976440621 }, { "epoch": 0.6182795698924731, "grad_norm": 0.927499532699585, "learning_rate": 2e-05, "loss": 0.6627, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9890, "tokens_per_second_per_gpu": 10269.57, "total_tokens": 976541119 }, { "epoch": 0.6183420855213804, "grad_norm": 0.9080202579498291, "learning_rate": 2e-05, "loss": 0.6411, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9891, "tokens_per_second_per_gpu": 10368.88, "total_tokens": 976642565 }, { "epoch": 0.6184046011502876, "grad_norm": 0.8718942403793335, "learning_rate": 2e-05, "loss": 0.613, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9892, "tokens_per_second_per_gpu": 10843.7, "total_tokens": 976744468 }, { "epoch": 0.6184671167791947, "grad_norm": 0.9073115587234497, "learning_rate": 2e-05, "loss": 0.6045, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9893, "tokens_per_second_per_gpu": 10505.84, "total_tokens": 976844264 }, { "epoch": 0.618529632408102, "grad_norm": 0.9376043677330017, "learning_rate": 2e-05, "loss": 0.6567, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9894, "tokens_per_second_per_gpu": 10251.58, "total_tokens": 976940990 }, { "epoch": 0.6185921480370092, "grad_norm": 0.9391866326332092, "learning_rate": 2e-05, "loss": 0.6125, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9895, "tokens_per_second_per_gpu": 10892.47, "total_tokens": 977041897 }, { "epoch": 0.6186546636659165, "grad_norm": 0.9128649234771729, "learning_rate": 2e-05, "loss": 0.5947, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9896, "tokens_per_second_per_gpu": 11091.77, "total_tokens": 977140069 }, { "epoch": 0.6187171792948237, "grad_norm": 0.9104706048965454, "learning_rate": 2e-05, "loss": 0.6371, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9897, "tokens_per_second_per_gpu": 10520.18, "total_tokens": 977234899 }, { "epoch": 0.618779694923731, "grad_norm": 0.8571684956550598, "learning_rate": 2e-05, "loss": 0.604, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9898, "tokens_per_second_per_gpu": 10684.03, "total_tokens": 977335845 }, { "epoch": 0.6188422105526381, "grad_norm": 0.9588193297386169, "learning_rate": 2e-05, "loss": 0.6581, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9899, "tokens_per_second_per_gpu": 10124.37, "total_tokens": 977433541 }, { "epoch": 0.6189047261815454, "grad_norm": 0.9010825157165527, "learning_rate": 2e-05, "loss": 0.6332, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9900, "tokens_per_second_per_gpu": 10676.2, "total_tokens": 977533254 }, { "epoch": 0.6189672418104526, "grad_norm": 0.8820099830627441, "learning_rate": 2e-05, "loss": 0.6051, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9901, "tokens_per_second_per_gpu": 10447.21, "total_tokens": 977628237 }, { "epoch": 0.6190297574393598, "grad_norm": 0.8991073966026306, "learning_rate": 2e-05, "loss": 0.6581, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9902, "tokens_per_second_per_gpu": 11036.05, "total_tokens": 977730110 }, { "epoch": 0.6190922730682671, "grad_norm": 0.8558743596076965, "learning_rate": 2e-05, "loss": 0.6293, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9903, "tokens_per_second_per_gpu": 11494.85, "total_tokens": 977836058 }, { "epoch": 0.6191547886971743, "grad_norm": 0.9021973609924316, "learning_rate": 2e-05, "loss": 0.6461, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9904, "tokens_per_second_per_gpu": 10551.23, "total_tokens": 977935110 }, { "epoch": 0.6192173043260816, "grad_norm": 0.8560947179794312, "learning_rate": 2e-05, "loss": 0.6185, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9905, "tokens_per_second_per_gpu": 10881.92, "total_tokens": 978038338 }, { "epoch": 0.6192798199549887, "grad_norm": 0.8879972696304321, "learning_rate": 2e-05, "loss": 0.6602, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9906, "tokens_per_second_per_gpu": 11050.21, "total_tokens": 978139321 }, { "epoch": 0.619342335583896, "grad_norm": 0.8913711905479431, "learning_rate": 2e-05, "loss": 0.6372, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9907, "tokens_per_second_per_gpu": 10068.84, "total_tokens": 978235832 }, { "epoch": 0.6194048512128032, "grad_norm": 0.9447364211082458, "learning_rate": 2e-05, "loss": 0.6252, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9908, "tokens_per_second_per_gpu": 10156.22, "total_tokens": 978332365 }, { "epoch": 0.6194673668417104, "grad_norm": 0.9574086666107178, "learning_rate": 2e-05, "loss": 0.65, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9909, "tokens_per_second_per_gpu": 10655.31, "total_tokens": 978434306 }, { "epoch": 0.6195298824706177, "grad_norm": 0.8650336861610413, "learning_rate": 2e-05, "loss": 0.6346, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9910, "tokens_per_second_per_gpu": 11304.37, "total_tokens": 978535919 }, { "epoch": 0.6195923980995249, "grad_norm": 0.8678915500640869, "learning_rate": 2e-05, "loss": 0.6203, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9911, "tokens_per_second_per_gpu": 10779.28, "total_tokens": 978635181 }, { "epoch": 0.6196549137284321, "grad_norm": 0.9111414551734924, "learning_rate": 2e-05, "loss": 0.6121, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9912, "tokens_per_second_per_gpu": 9255.83, "total_tokens": 978730275 }, { "epoch": 0.6197174293573393, "grad_norm": 0.876667857170105, "learning_rate": 2e-05, "loss": 0.5794, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9913, "tokens_per_second_per_gpu": 10587.05, "total_tokens": 978829957 }, { "epoch": 0.6197799449862466, "grad_norm": 0.8841650485992432, "learning_rate": 2e-05, "loss": 0.6357, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9914, "tokens_per_second_per_gpu": 10606.41, "total_tokens": 978928734 }, { "epoch": 0.6198424606151538, "grad_norm": 0.9087359309196472, "learning_rate": 2e-05, "loss": 0.6396, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9915, "tokens_per_second_per_gpu": 10519.01, "total_tokens": 979028482 }, { "epoch": 0.619904976244061, "grad_norm": 0.9035404920578003, "learning_rate": 2e-05, "loss": 0.6484, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9916, "tokens_per_second_per_gpu": 10370.29, "total_tokens": 979127821 }, { "epoch": 0.6199674918729683, "grad_norm": 0.8919773697853088, "learning_rate": 2e-05, "loss": 0.6272, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9917, "tokens_per_second_per_gpu": 10965.23, "total_tokens": 979233734 }, { "epoch": 0.6200300075018754, "grad_norm": 0.8974595665931702, "learning_rate": 2e-05, "loss": 0.6366, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9918, "tokens_per_second_per_gpu": 10804.76, "total_tokens": 979334121 }, { "epoch": 0.6200925231307827, "grad_norm": 0.8512986898422241, "learning_rate": 2e-05, "loss": 0.6146, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9919, "tokens_per_second_per_gpu": 10288.28, "total_tokens": 979432263 }, { "epoch": 0.6201550387596899, "grad_norm": 1.1193383932113647, "learning_rate": 2e-05, "loss": 0.6596, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9920, "tokens_per_second_per_gpu": 10857.42, "total_tokens": 979537344 }, { "epoch": 0.6202175543885972, "grad_norm": 0.9179471135139465, "learning_rate": 2e-05, "loss": 0.6272, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9921, "tokens_per_second_per_gpu": 10409.22, "total_tokens": 979635559 }, { "epoch": 0.6202800700175044, "grad_norm": 0.8903002142906189, "learning_rate": 2e-05, "loss": 0.6261, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9922, "tokens_per_second_per_gpu": 11197.86, "total_tokens": 979735313 }, { "epoch": 0.6203425856464116, "grad_norm": 0.93846195936203, "learning_rate": 2e-05, "loss": 0.6263, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9923, "tokens_per_second_per_gpu": 10072.22, "total_tokens": 979831069 }, { "epoch": 0.6204051012753188, "grad_norm": 0.849493682384491, "learning_rate": 2e-05, "loss": 0.6085, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9924, "tokens_per_second_per_gpu": 9964.99, "total_tokens": 979930591 }, { "epoch": 0.620467616904226, "grad_norm": 0.9667713046073914, "learning_rate": 2e-05, "loss": 0.6383, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9925, "tokens_per_second_per_gpu": 10907.05, "total_tokens": 980034196 }, { "epoch": 0.6205301325331333, "grad_norm": 0.8537660837173462, "learning_rate": 2e-05, "loss": 0.6278, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9926, "tokens_per_second_per_gpu": 10977.01, "total_tokens": 980134541 }, { "epoch": 0.6205926481620405, "grad_norm": 0.8792341351509094, "learning_rate": 2e-05, "loss": 0.6456, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9927, "tokens_per_second_per_gpu": 11055.19, "total_tokens": 980235482 }, { "epoch": 0.6206551637909478, "grad_norm": 0.8952016234397888, "learning_rate": 2e-05, "loss": 0.63, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9928, "tokens_per_second_per_gpu": 10879.61, "total_tokens": 980335191 }, { "epoch": 0.620717679419855, "grad_norm": 0.8762631416320801, "learning_rate": 2e-05, "loss": 0.6207, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9929, "tokens_per_second_per_gpu": 11288.28, "total_tokens": 980439267 }, { "epoch": 0.6207801950487621, "grad_norm": 0.9247403740882874, "learning_rate": 2e-05, "loss": 0.6479, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9930, "tokens_per_second_per_gpu": 10545.35, "total_tokens": 980537788 }, { "epoch": 0.6208427106776694, "grad_norm": 0.8657614588737488, "learning_rate": 2e-05, "loss": 0.5747, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9931, "tokens_per_second_per_gpu": 10833.59, "total_tokens": 980635968 }, { "epoch": 0.6209052263065766, "grad_norm": 0.9113485813140869, "learning_rate": 2e-05, "loss": 0.5982, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9932, "tokens_per_second_per_gpu": 10106.42, "total_tokens": 980727242 }, { "epoch": 0.6209677419354839, "grad_norm": 0.8827416300773621, "learning_rate": 2e-05, "loss": 0.6433, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9933, "tokens_per_second_per_gpu": 9995.7, "total_tokens": 980826285 }, { "epoch": 0.6210302575643911, "grad_norm": 0.8923988342285156, "learning_rate": 2e-05, "loss": 0.6706, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9934, "tokens_per_second_per_gpu": 11086.9, "total_tokens": 980926648 }, { "epoch": 0.6210927731932984, "grad_norm": 0.9075294733047485, "learning_rate": 2e-05, "loss": 0.6257, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9935, "tokens_per_second_per_gpu": 10951.78, "total_tokens": 981023506 }, { "epoch": 0.6211552888222055, "grad_norm": 0.8536057472229004, "learning_rate": 2e-05, "loss": 0.6053, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9936, "tokens_per_second_per_gpu": 10300.37, "total_tokens": 981123433 }, { "epoch": 0.6212178044511127, "grad_norm": 0.8994846343994141, "learning_rate": 2e-05, "loss": 0.627, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9937, "tokens_per_second_per_gpu": 10615.83, "total_tokens": 981223362 }, { "epoch": 0.62128032008002, "grad_norm": 0.876002311706543, "learning_rate": 2e-05, "loss": 0.6372, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9938, "tokens_per_second_per_gpu": 10291.82, "total_tokens": 981323903 }, { "epoch": 0.6213428357089272, "grad_norm": 0.8777185082435608, "learning_rate": 2e-05, "loss": 0.6218, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9939, "tokens_per_second_per_gpu": 11184.22, "total_tokens": 981423121 }, { "epoch": 0.6214053513378345, "grad_norm": 0.881783664226532, "learning_rate": 2e-05, "loss": 0.6159, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9940, "tokens_per_second_per_gpu": 10664.82, "total_tokens": 981522366 }, { "epoch": 0.6214678669667417, "grad_norm": 0.887507975101471, "learning_rate": 2e-05, "loss": 0.6333, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9941, "tokens_per_second_per_gpu": 9921.9, "total_tokens": 981620045 }, { "epoch": 0.621530382595649, "grad_norm": 0.8999144434928894, "learning_rate": 2e-05, "loss": 0.6207, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9942, "tokens_per_second_per_gpu": 10177.77, "total_tokens": 981718014 }, { "epoch": 0.6215928982245561, "grad_norm": 0.8891124725341797, "learning_rate": 2e-05, "loss": 0.6535, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9943, "tokens_per_second_per_gpu": 10511.69, "total_tokens": 981820867 }, { "epoch": 0.6216554138534633, "grad_norm": 0.8769607543945312, "learning_rate": 2e-05, "loss": 0.641, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9944, "tokens_per_second_per_gpu": 11002.51, "total_tokens": 981919819 }, { "epoch": 0.6217179294823706, "grad_norm": 0.908473551273346, "learning_rate": 2e-05, "loss": 0.6182, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9945, "tokens_per_second_per_gpu": 10987.31, "total_tokens": 982019562 }, { "epoch": 0.6217804451112778, "grad_norm": 0.9237586259841919, "learning_rate": 2e-05, "loss": 0.608, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9946, "tokens_per_second_per_gpu": 10986.0, "total_tokens": 982116076 }, { "epoch": 0.6218429607401851, "grad_norm": 0.8744648694992065, "learning_rate": 2e-05, "loss": 0.6233, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9947, "tokens_per_second_per_gpu": 10552.43, "total_tokens": 982216930 }, { "epoch": 0.6219054763690923, "grad_norm": 0.8846673965454102, "learning_rate": 2e-05, "loss": 0.5733, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9948, "tokens_per_second_per_gpu": 9392.39, "total_tokens": 982308887 }, { "epoch": 0.6219679919979995, "grad_norm": 0.9110782742500305, "learning_rate": 2e-05, "loss": 0.651, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9949, "tokens_per_second_per_gpu": 10985.59, "total_tokens": 982411222 }, { "epoch": 0.6220305076269067, "grad_norm": 0.9347003698348999, "learning_rate": 2e-05, "loss": 0.6616, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9950, "tokens_per_second_per_gpu": 10758.02, "total_tokens": 982507570 }, { "epoch": 0.622093023255814, "grad_norm": 0.8918219208717346, "learning_rate": 2e-05, "loss": 0.6052, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9951, "tokens_per_second_per_gpu": 9984.73, "total_tokens": 982599050 }, { "epoch": 0.6221555388847212, "grad_norm": 0.9022554159164429, "learning_rate": 2e-05, "loss": 0.6123, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9952, "tokens_per_second_per_gpu": 10543.58, "total_tokens": 982697037 }, { "epoch": 0.6222180545136284, "grad_norm": 0.8955296277999878, "learning_rate": 2e-05, "loss": 0.6121, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9953, "tokens_per_second_per_gpu": 9981.14, "total_tokens": 982792920 }, { "epoch": 0.6222805701425357, "grad_norm": 0.851600706577301, "learning_rate": 2e-05, "loss": 0.6261, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9954, "tokens_per_second_per_gpu": 10672.21, "total_tokens": 982893018 }, { "epoch": 0.6223430857714428, "grad_norm": 0.900229275226593, "learning_rate": 2e-05, "loss": 0.6416, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9955, "tokens_per_second_per_gpu": 10361.93, "total_tokens": 982993870 }, { "epoch": 0.6224056014003501, "grad_norm": 0.8973135352134705, "learning_rate": 2e-05, "loss": 0.5989, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9956, "tokens_per_second_per_gpu": 10104.1, "total_tokens": 983089555 }, { "epoch": 0.6224681170292573, "grad_norm": 0.9132903814315796, "learning_rate": 2e-05, "loss": 0.5958, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9957, "tokens_per_second_per_gpu": 11384.58, "total_tokens": 983190315 }, { "epoch": 0.6225306326581646, "grad_norm": 0.8904170393943787, "learning_rate": 2e-05, "loss": 0.5975, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9958, "tokens_per_second_per_gpu": 10791.5, "total_tokens": 983288772 }, { "epoch": 0.6225931482870718, "grad_norm": 0.838005542755127, "learning_rate": 2e-05, "loss": 0.589, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9959, "tokens_per_second_per_gpu": 10599.57, "total_tokens": 983386831 }, { "epoch": 0.622655663915979, "grad_norm": 0.8947557806968689, "learning_rate": 2e-05, "loss": 0.628, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9960, "tokens_per_second_per_gpu": 9891.66, "total_tokens": 983481366 }, { "epoch": 0.6227181795448862, "grad_norm": 0.9939214587211609, "learning_rate": 2e-05, "loss": 0.5972, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9961, "tokens_per_second_per_gpu": 10200.94, "total_tokens": 983578937 }, { "epoch": 0.6227806951737934, "grad_norm": 0.9313402771949768, "learning_rate": 2e-05, "loss": 0.6478, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9962, "tokens_per_second_per_gpu": 10662.55, "total_tokens": 983678459 }, { "epoch": 0.6228432108027007, "grad_norm": 0.8853929042816162, "learning_rate": 2e-05, "loss": 0.6115, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9963, "tokens_per_second_per_gpu": 10842.39, "total_tokens": 983774780 }, { "epoch": 0.6229057264316079, "grad_norm": 0.941253125667572, "learning_rate": 2e-05, "loss": 0.6479, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9964, "tokens_per_second_per_gpu": 10860.12, "total_tokens": 983877064 }, { "epoch": 0.6229682420605152, "grad_norm": 0.8508719205856323, "learning_rate": 2e-05, "loss": 0.6016, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9965, "tokens_per_second_per_gpu": 10754.88, "total_tokens": 983976308 }, { "epoch": 0.6230307576894224, "grad_norm": 0.9419757127761841, "learning_rate": 2e-05, "loss": 0.6231, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9966, "tokens_per_second_per_gpu": 10580.59, "total_tokens": 984073642 }, { "epoch": 0.6230932733183295, "grad_norm": 0.9177042245864868, "learning_rate": 2e-05, "loss": 0.5735, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9967, "tokens_per_second_per_gpu": 10358.6, "total_tokens": 984169368 }, { "epoch": 0.6231557889472368, "grad_norm": 0.8970114588737488, "learning_rate": 2e-05, "loss": 0.6359, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9968, "tokens_per_second_per_gpu": 10719.63, "total_tokens": 984270814 }, { "epoch": 0.623218304576144, "grad_norm": 0.9072692394256592, "learning_rate": 2e-05, "loss": 0.6258, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9969, "tokens_per_second_per_gpu": 11035.2, "total_tokens": 984368814 }, { "epoch": 0.6232808202050513, "grad_norm": 0.9036104679107666, "learning_rate": 2e-05, "loss": 0.6454, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9970, "tokens_per_second_per_gpu": 10123.73, "total_tokens": 984465062 }, { "epoch": 0.6233433358339585, "grad_norm": 0.8743268847465515, "learning_rate": 2e-05, "loss": 0.6532, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9971, "tokens_per_second_per_gpu": 10807.38, "total_tokens": 984563807 }, { "epoch": 0.6234058514628658, "grad_norm": 0.8700389862060547, "learning_rate": 2e-05, "loss": 0.6036, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9972, "tokens_per_second_per_gpu": 11247.47, "total_tokens": 984664984 }, { "epoch": 0.6234683670917729, "grad_norm": 0.9357583522796631, "learning_rate": 2e-05, "loss": 0.6153, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9973, "tokens_per_second_per_gpu": 10151.54, "total_tokens": 984760216 }, { "epoch": 0.6235308827206801, "grad_norm": 0.9047724008560181, "learning_rate": 2e-05, "loss": 0.6585, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9974, "tokens_per_second_per_gpu": 11280.0, "total_tokens": 984862923 }, { "epoch": 0.6235933983495874, "grad_norm": 0.8819137215614319, "learning_rate": 2e-05, "loss": 0.6863, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9975, "tokens_per_second_per_gpu": 11019.51, "total_tokens": 984962483 }, { "epoch": 0.6236559139784946, "grad_norm": 0.8820326924324036, "learning_rate": 2e-05, "loss": 0.6326, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9976, "tokens_per_second_per_gpu": 11263.24, "total_tokens": 985065600 }, { "epoch": 0.6237184296074019, "grad_norm": 0.8680112361907959, "learning_rate": 2e-05, "loss": 0.6268, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9977, "tokens_per_second_per_gpu": 10928.55, "total_tokens": 985166709 }, { "epoch": 0.6237809452363091, "grad_norm": 0.893480122089386, "learning_rate": 2e-05, "loss": 0.6124, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9978, "tokens_per_second_per_gpu": 9499.67, "total_tokens": 985261692 }, { "epoch": 0.6238434608652164, "grad_norm": 0.8696945309638977, "learning_rate": 2e-05, "loss": 0.6265, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9979, "tokens_per_second_per_gpu": 10566.92, "total_tokens": 985363462 }, { "epoch": 0.6239059764941235, "grad_norm": 0.9217736124992371, "learning_rate": 2e-05, "loss": 0.6259, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9980, "tokens_per_second_per_gpu": 10608.98, "total_tokens": 985462780 }, { "epoch": 0.6239684921230307, "grad_norm": 0.8938741683959961, "learning_rate": 2e-05, "loss": 0.6693, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9981, "tokens_per_second_per_gpu": 10318.02, "total_tokens": 985565231 }, { "epoch": 0.624031007751938, "grad_norm": 0.8752861022949219, "learning_rate": 2e-05, "loss": 0.6282, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9982, "tokens_per_second_per_gpu": 15856.3, "total_tokens": 985665406 }, { "epoch": 0.6240935233808452, "grad_norm": 0.8648839592933655, "learning_rate": 2e-05, "loss": 0.6787, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9983, "tokens_per_second_per_gpu": 10500.24, "total_tokens": 985767261 }, { "epoch": 0.6241560390097525, "grad_norm": 0.9130885601043701, "learning_rate": 2e-05, "loss": 0.6343, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9984, "tokens_per_second_per_gpu": 10428.09, "total_tokens": 985865666 }, { "epoch": 0.6242185546386597, "grad_norm": 0.9000247120857239, "learning_rate": 2e-05, "loss": 0.6741, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9985, "tokens_per_second_per_gpu": 10217.06, "total_tokens": 985967728 }, { "epoch": 0.6242810702675669, "grad_norm": 0.8890751004219055, "learning_rate": 2e-05, "loss": 0.6089, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9986, "tokens_per_second_per_gpu": 10325.74, "total_tokens": 986063179 }, { "epoch": 0.6243435858964741, "grad_norm": 0.8715104460716248, "learning_rate": 2e-05, "loss": 0.6094, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9987, "tokens_per_second_per_gpu": 11116.87, "total_tokens": 986162092 }, { "epoch": 0.6244061015253813, "grad_norm": 0.9174119830131531, "learning_rate": 2e-05, "loss": 0.6143, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9988, "tokens_per_second_per_gpu": 10347.01, "total_tokens": 986260580 }, { "epoch": 0.6244686171542886, "grad_norm": 0.879021942615509, "learning_rate": 2e-05, "loss": 0.6279, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9989, "tokens_per_second_per_gpu": 9912.01, "total_tokens": 986355324 }, { "epoch": 0.6245311327831958, "grad_norm": 0.882802426815033, "learning_rate": 2e-05, "loss": 0.6138, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9990, "tokens_per_second_per_gpu": 9742.66, "total_tokens": 986452294 }, { "epoch": 0.6245936484121031, "grad_norm": 0.9104074239730835, "learning_rate": 2e-05, "loss": 0.6337, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9991, "tokens_per_second_per_gpu": 10230.62, "total_tokens": 986545118 }, { "epoch": 0.6246561640410102, "grad_norm": 0.8871777653694153, "learning_rate": 2e-05, "loss": 0.6323, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9992, "tokens_per_second_per_gpu": 9729.44, "total_tokens": 986641117 }, { "epoch": 0.6247186796699175, "grad_norm": 0.857362687587738, "learning_rate": 2e-05, "loss": 0.6126, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9993, "tokens_per_second_per_gpu": 10195.83, "total_tokens": 986741984 }, { "epoch": 0.6247811952988247, "grad_norm": 0.9304455518722534, "learning_rate": 2e-05, "loss": 0.6396, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9994, "tokens_per_second_per_gpu": 10298.71, "total_tokens": 986834894 }, { "epoch": 0.6248437109277319, "grad_norm": 0.9017961025238037, "learning_rate": 2e-05, "loss": 0.6553, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9995, "tokens_per_second_per_gpu": 10881.79, "total_tokens": 986934797 }, { "epoch": 0.6249062265566392, "grad_norm": 0.9225963354110718, "learning_rate": 2e-05, "loss": 0.6346, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9996, "tokens_per_second_per_gpu": 10419.69, "total_tokens": 987031522 }, { "epoch": 0.6249687421855464, "grad_norm": 0.8880707025527954, "learning_rate": 2e-05, "loss": 0.636, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9997, "tokens_per_second_per_gpu": 10026.03, "total_tokens": 987128793 }, { "epoch": 0.6250312578144536, "grad_norm": 0.8685660362243652, "learning_rate": 2e-05, "loss": 0.6038, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9998, "tokens_per_second_per_gpu": 10343.54, "total_tokens": 987227411 }, { "epoch": 0.6250937734433608, "grad_norm": 0.9144018292427063, "learning_rate": 2e-05, "loss": 0.6533, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 9999, "tokens_per_second_per_gpu": 9745.29, "total_tokens": 987324008 }, { "epoch": 0.6251562890722681, "grad_norm": 0.9259751439094543, "learning_rate": 2e-05, "loss": 0.6417, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10000, "tokens_per_second_per_gpu": 10928.09, "total_tokens": 987418658 }, { "epoch": 0.6252188047011753, "grad_norm": 0.8682181239128113, "learning_rate": 2e-05, "loss": 0.6163, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10001, "tokens_per_second_per_gpu": 9791.4, "total_tokens": 987518889 }, { "epoch": 0.6252813203300825, "grad_norm": 0.9058620929718018, "learning_rate": 2e-05, "loss": 0.6584, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10002, "tokens_per_second_per_gpu": 10933.83, "total_tokens": 987619920 }, { "epoch": 0.6253438359589898, "grad_norm": 0.8483473062515259, "learning_rate": 2e-05, "loss": 0.6153, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10003, "tokens_per_second_per_gpu": 10903.77, "total_tokens": 987719862 }, { "epoch": 0.6254063515878969, "grad_norm": 0.8993352055549622, "learning_rate": 2e-05, "loss": 0.6573, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10004, "tokens_per_second_per_gpu": 10488.73, "total_tokens": 987817905 }, { "epoch": 0.6254688672168042, "grad_norm": 0.8794964551925659, "learning_rate": 2e-05, "loss": 0.596, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10005, "tokens_per_second_per_gpu": 10489.33, "total_tokens": 987915037 }, { "epoch": 0.6255313828457114, "grad_norm": 0.9235408902168274, "learning_rate": 2e-05, "loss": 0.6346, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10006, "tokens_per_second_per_gpu": 10559.77, "total_tokens": 988010798 }, { "epoch": 0.6255938984746187, "grad_norm": 0.9112551212310791, "learning_rate": 2e-05, "loss": 0.6138, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10007, "tokens_per_second_per_gpu": 10022.47, "total_tokens": 988107153 }, { "epoch": 0.6256564141035259, "grad_norm": 0.878998875617981, "learning_rate": 2e-05, "loss": 0.64, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10008, "tokens_per_second_per_gpu": 11019.98, "total_tokens": 988205759 }, { "epoch": 0.6257189297324331, "grad_norm": 0.9075168371200562, "learning_rate": 2e-05, "loss": 0.6147, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10009, "tokens_per_second_per_gpu": 9733.85, "total_tokens": 988305523 }, { "epoch": 0.6257814453613403, "grad_norm": 0.8979536890983582, "learning_rate": 2e-05, "loss": 0.6454, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10010, "tokens_per_second_per_gpu": 9407.51, "total_tokens": 988402459 }, { "epoch": 0.6258439609902475, "grad_norm": 0.864797055721283, "learning_rate": 2e-05, "loss": 0.6496, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10011, "tokens_per_second_per_gpu": 11142.63, "total_tokens": 988502624 }, { "epoch": 0.6259064766191548, "grad_norm": 0.9239476919174194, "learning_rate": 2e-05, "loss": 0.6662, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10012, "tokens_per_second_per_gpu": 10557.04, "total_tokens": 988601754 }, { "epoch": 0.625968992248062, "grad_norm": 0.9310629367828369, "learning_rate": 2e-05, "loss": 0.622, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10013, "tokens_per_second_per_gpu": 10841.98, "total_tokens": 988698115 }, { "epoch": 0.6260315078769693, "grad_norm": 0.896422266960144, "learning_rate": 2e-05, "loss": 0.6021, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10014, "tokens_per_second_per_gpu": 10611.89, "total_tokens": 988795997 }, { "epoch": 0.6260940235058765, "grad_norm": 0.9496192932128906, "learning_rate": 2e-05, "loss": 0.6345, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10015, "tokens_per_second_per_gpu": 9834.47, "total_tokens": 988890925 }, { "epoch": 0.6261565391347836, "grad_norm": 0.8922857642173767, "learning_rate": 2e-05, "loss": 0.6578, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10016, "tokens_per_second_per_gpu": 11106.19, "total_tokens": 988996043 }, { "epoch": 0.6262190547636909, "grad_norm": 0.8930054903030396, "learning_rate": 2e-05, "loss": 0.5896, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10017, "tokens_per_second_per_gpu": 10456.79, "total_tokens": 989094937 }, { "epoch": 0.6262815703925981, "grad_norm": 0.9031338691711426, "learning_rate": 2e-05, "loss": 0.6218, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10018, "tokens_per_second_per_gpu": 10269.9, "total_tokens": 989191937 }, { "epoch": 0.6263440860215054, "grad_norm": 0.8562846779823303, "learning_rate": 2e-05, "loss": 0.6439, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10019, "tokens_per_second_per_gpu": 11016.43, "total_tokens": 989295540 }, { "epoch": 0.6264066016504126, "grad_norm": 0.8659762740135193, "learning_rate": 2e-05, "loss": 0.5796, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10020, "tokens_per_second_per_gpu": 10624.04, "total_tokens": 989394590 }, { "epoch": 0.6264691172793199, "grad_norm": 0.8668360710144043, "learning_rate": 2e-05, "loss": 0.6324, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10021, "tokens_per_second_per_gpu": 11174.85, "total_tokens": 989496529 }, { "epoch": 0.6265316329082271, "grad_norm": 0.8855212926864624, "learning_rate": 2e-05, "loss": 0.6524, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10022, "tokens_per_second_per_gpu": 9761.49, "total_tokens": 989594592 }, { "epoch": 0.6265941485371342, "grad_norm": 0.8747006058692932, "learning_rate": 2e-05, "loss": 0.6128, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10023, "tokens_per_second_per_gpu": 10365.93, "total_tokens": 989691839 }, { "epoch": 0.6266566641660415, "grad_norm": 0.8502959609031677, "learning_rate": 2e-05, "loss": 0.6049, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10024, "tokens_per_second_per_gpu": 10548.1, "total_tokens": 989794745 }, { "epoch": 0.6267191797949487, "grad_norm": 0.8994104862213135, "learning_rate": 2e-05, "loss": 0.6379, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10025, "tokens_per_second_per_gpu": 10364.86, "total_tokens": 989896231 }, { "epoch": 0.626781695423856, "grad_norm": 0.86329185962677, "learning_rate": 2e-05, "loss": 0.6091, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10026, "tokens_per_second_per_gpu": 10197.51, "total_tokens": 989996284 }, { "epoch": 0.6268442110527632, "grad_norm": 0.8568360805511475, "learning_rate": 2e-05, "loss": 0.6259, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10027, "tokens_per_second_per_gpu": 10400.16, "total_tokens": 990093759 }, { "epoch": 0.6269067266816705, "grad_norm": 0.8722648620605469, "learning_rate": 2e-05, "loss": 0.63, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10028, "tokens_per_second_per_gpu": 10828.33, "total_tokens": 990194349 }, { "epoch": 0.6269692423105776, "grad_norm": 0.8943681120872498, "learning_rate": 2e-05, "loss": 0.609, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10029, "tokens_per_second_per_gpu": 10071.26, "total_tokens": 990291436 }, { "epoch": 0.6270317579394848, "grad_norm": 0.8845461010932922, "learning_rate": 2e-05, "loss": 0.6158, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10030, "tokens_per_second_per_gpu": 10599.49, "total_tokens": 990391701 }, { "epoch": 0.6270942735683921, "grad_norm": 0.906009316444397, "learning_rate": 2e-05, "loss": 0.6378, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10031, "tokens_per_second_per_gpu": 11032.55, "total_tokens": 990492061 }, { "epoch": 0.6271567891972993, "grad_norm": 0.8880152702331543, "learning_rate": 2e-05, "loss": 0.6657, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10032, "tokens_per_second_per_gpu": 11433.73, "total_tokens": 990598552 }, { "epoch": 0.6272193048262066, "grad_norm": 0.874498724937439, "learning_rate": 2e-05, "loss": 0.6347, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10033, "tokens_per_second_per_gpu": 10529.51, "total_tokens": 990697907 }, { "epoch": 0.6272818204551138, "grad_norm": 0.8799840807914734, "learning_rate": 2e-05, "loss": 0.6169, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10034, "tokens_per_second_per_gpu": 10598.08, "total_tokens": 990793117 }, { "epoch": 0.627344336084021, "grad_norm": 0.9578828811645508, "learning_rate": 2e-05, "loss": 0.596, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10035, "tokens_per_second_per_gpu": 10701.25, "total_tokens": 990891096 }, { "epoch": 0.6274068517129282, "grad_norm": 0.847515881061554, "learning_rate": 2e-05, "loss": 0.609, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10036, "tokens_per_second_per_gpu": 11307.76, "total_tokens": 990995267 }, { "epoch": 0.6274693673418354, "grad_norm": 0.8695659637451172, "learning_rate": 2e-05, "loss": 0.6315, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10037, "tokens_per_second_per_gpu": 11300.35, "total_tokens": 991096366 }, { "epoch": 0.6275318829707427, "grad_norm": 0.8957040309906006, "learning_rate": 2e-05, "loss": 0.6354, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10038, "tokens_per_second_per_gpu": 10394.67, "total_tokens": 991195307 }, { "epoch": 0.6275943985996499, "grad_norm": 0.9041759967803955, "learning_rate": 2e-05, "loss": 0.6499, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10039, "tokens_per_second_per_gpu": 10614.13, "total_tokens": 991296456 }, { "epoch": 0.6276569142285572, "grad_norm": 0.8931875228881836, "learning_rate": 2e-05, "loss": 0.6614, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10040, "tokens_per_second_per_gpu": 10496.63, "total_tokens": 991397053 }, { "epoch": 0.6277194298574643, "grad_norm": 0.9186697006225586, "learning_rate": 2e-05, "loss": 0.6479, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10041, "tokens_per_second_per_gpu": 10090.97, "total_tokens": 991490064 }, { "epoch": 0.6277819454863716, "grad_norm": 0.8560078144073486, "learning_rate": 2e-05, "loss": 0.5941, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10042, "tokens_per_second_per_gpu": 10771.38, "total_tokens": 991587122 }, { "epoch": 0.6278444611152788, "grad_norm": 0.8960086107254028, "learning_rate": 2e-05, "loss": 0.6202, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10043, "tokens_per_second_per_gpu": 10631.0, "total_tokens": 991686022 }, { "epoch": 0.627906976744186, "grad_norm": 0.8785136342048645, "learning_rate": 2e-05, "loss": 0.5921, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10044, "tokens_per_second_per_gpu": 10541.99, "total_tokens": 991784191 }, { "epoch": 0.6279694923730933, "grad_norm": 0.8738837838172913, "learning_rate": 2e-05, "loss": 0.6423, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10045, "tokens_per_second_per_gpu": 11030.33, "total_tokens": 991885269 }, { "epoch": 0.6280320080020005, "grad_norm": 0.9039458632469177, "learning_rate": 2e-05, "loss": 0.6691, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10046, "tokens_per_second_per_gpu": 10235.06, "total_tokens": 991986123 }, { "epoch": 0.6280945236309077, "grad_norm": 0.8986523151397705, "learning_rate": 2e-05, "loss": 0.6193, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10047, "tokens_per_second_per_gpu": 10426.54, "total_tokens": 992082372 }, { "epoch": 0.6281570392598149, "grad_norm": 0.91213059425354, "learning_rate": 2e-05, "loss": 0.6593, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10048, "tokens_per_second_per_gpu": 11238.9, "total_tokens": 992185958 }, { "epoch": 0.6282195548887222, "grad_norm": 0.8740804195404053, "learning_rate": 2e-05, "loss": 0.6601, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10049, "tokens_per_second_per_gpu": 11138.36, "total_tokens": 992287087 }, { "epoch": 0.6282820705176294, "grad_norm": 0.8977333903312683, "learning_rate": 2e-05, "loss": 0.6088, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10050, "tokens_per_second_per_gpu": 10072.72, "total_tokens": 992384012 }, { "epoch": 0.6283445861465367, "grad_norm": 0.8748984336853027, "learning_rate": 2e-05, "loss": 0.5731, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10051, "tokens_per_second_per_gpu": 10346.78, "total_tokens": 992477769 }, { "epoch": 0.6284071017754439, "grad_norm": 0.901285707950592, "learning_rate": 2e-05, "loss": 0.6413, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10052, "tokens_per_second_per_gpu": 9754.44, "total_tokens": 992575689 }, { "epoch": 0.628469617404351, "grad_norm": 0.8941757082939148, "learning_rate": 2e-05, "loss": 0.6673, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10053, "tokens_per_second_per_gpu": 10933.03, "total_tokens": 992676541 }, { "epoch": 0.6285321330332583, "grad_norm": 0.90824294090271, "learning_rate": 2e-05, "loss": 0.6378, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10054, "tokens_per_second_per_gpu": 9905.63, "total_tokens": 992774367 }, { "epoch": 0.6285946486621655, "grad_norm": 0.865159273147583, "learning_rate": 2e-05, "loss": 0.6416, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10055, "tokens_per_second_per_gpu": 10520.42, "total_tokens": 992876265 }, { "epoch": 0.6286571642910728, "grad_norm": 0.8641396760940552, "learning_rate": 2e-05, "loss": 0.6228, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10056, "tokens_per_second_per_gpu": 11071.18, "total_tokens": 992977502 }, { "epoch": 0.62871967991998, "grad_norm": 0.9548392295837402, "learning_rate": 2e-05, "loss": 0.7162, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10057, "tokens_per_second_per_gpu": 9879.4, "total_tokens": 993072158 }, { "epoch": 0.6287821955488873, "grad_norm": 0.896414041519165, "learning_rate": 2e-05, "loss": 0.6118, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10058, "tokens_per_second_per_gpu": 10324.02, "total_tokens": 993166615 }, { "epoch": 0.6288447111777945, "grad_norm": 0.8933358788490295, "learning_rate": 2e-05, "loss": 0.6679, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10059, "tokens_per_second_per_gpu": 10865.97, "total_tokens": 993269128 }, { "epoch": 0.6289072268067016, "grad_norm": 0.901603102684021, "learning_rate": 2e-05, "loss": 0.6262, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10060, "tokens_per_second_per_gpu": 10367.11, "total_tokens": 993369792 }, { "epoch": 0.6289697424356089, "grad_norm": 0.9582881331443787, "learning_rate": 2e-05, "loss": 0.6532, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10061, "tokens_per_second_per_gpu": 10760.6, "total_tokens": 993467534 }, { "epoch": 0.6290322580645161, "grad_norm": 0.875295877456665, "learning_rate": 2e-05, "loss": 0.6394, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10062, "tokens_per_second_per_gpu": 10030.13, "total_tokens": 993565462 }, { "epoch": 0.6290947736934234, "grad_norm": 0.8713257312774658, "learning_rate": 2e-05, "loss": 0.597, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10063, "tokens_per_second_per_gpu": 9350.73, "total_tokens": 993662957 }, { "epoch": 0.6291572893223306, "grad_norm": 0.8774081468582153, "learning_rate": 2e-05, "loss": 0.6272, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10064, "tokens_per_second_per_gpu": 11002.55, "total_tokens": 993765600 }, { "epoch": 0.6292198049512379, "grad_norm": 0.9102866649627686, "learning_rate": 2e-05, "loss": 0.6327, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10065, "tokens_per_second_per_gpu": 10192.09, "total_tokens": 993863345 }, { "epoch": 0.629282320580145, "grad_norm": 0.8728180527687073, "learning_rate": 2e-05, "loss": 0.6281, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10066, "tokens_per_second_per_gpu": 10810.81, "total_tokens": 993963481 }, { "epoch": 0.6293448362090522, "grad_norm": 0.9193761348724365, "learning_rate": 2e-05, "loss": 0.6233, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10067, "tokens_per_second_per_gpu": 10562.48, "total_tokens": 994057744 }, { "epoch": 0.6294073518379595, "grad_norm": 0.8806085586547852, "learning_rate": 2e-05, "loss": 0.631, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10068, "tokens_per_second_per_gpu": 10576.17, "total_tokens": 994155573 }, { "epoch": 0.6294698674668667, "grad_norm": 0.9072361588478088, "learning_rate": 2e-05, "loss": 0.6385, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10069, "tokens_per_second_per_gpu": 11034.35, "total_tokens": 994258732 }, { "epoch": 0.629532383095774, "grad_norm": 0.8860622644424438, "learning_rate": 2e-05, "loss": 0.6312, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10070, "tokens_per_second_per_gpu": 10794.82, "total_tokens": 994361912 }, { "epoch": 0.6295948987246812, "grad_norm": 0.835964024066925, "learning_rate": 2e-05, "loss": 0.634, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10071, "tokens_per_second_per_gpu": 11225.47, "total_tokens": 994467057 }, { "epoch": 0.6296574143535884, "grad_norm": 0.8941155076026917, "learning_rate": 2e-05, "loss": 0.6139, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10072, "tokens_per_second_per_gpu": 10472.27, "total_tokens": 994566163 }, { "epoch": 0.6297199299824956, "grad_norm": 0.8728143572807312, "learning_rate": 2e-05, "loss": 0.5999, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10073, "tokens_per_second_per_gpu": 10330.68, "total_tokens": 994664316 }, { "epoch": 0.6297824456114028, "grad_norm": 0.9081434607505798, "learning_rate": 2e-05, "loss": 0.6314, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10074, "tokens_per_second_per_gpu": 10762.75, "total_tokens": 994762837 }, { "epoch": 0.6298449612403101, "grad_norm": 0.895675539970398, "learning_rate": 2e-05, "loss": 0.6374, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10075, "tokens_per_second_per_gpu": 9558.13, "total_tokens": 994858966 }, { "epoch": 0.6299074768692173, "grad_norm": 0.892586350440979, "learning_rate": 2e-05, "loss": 0.6269, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10076, "tokens_per_second_per_gpu": 10545.9, "total_tokens": 994956971 }, { "epoch": 0.6299699924981246, "grad_norm": 0.8914721012115479, "learning_rate": 2e-05, "loss": 0.6172, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10077, "tokens_per_second_per_gpu": 10488.92, "total_tokens": 995054069 }, { "epoch": 0.6300325081270317, "grad_norm": 0.8932744860649109, "learning_rate": 2e-05, "loss": 0.6268, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10078, "tokens_per_second_per_gpu": 9572.02, "total_tokens": 995151093 }, { "epoch": 0.630095023755939, "grad_norm": 0.8799780011177063, "learning_rate": 2e-05, "loss": 0.6236, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10079, "tokens_per_second_per_gpu": 9904.59, "total_tokens": 995245255 }, { "epoch": 0.6301575393848462, "grad_norm": 0.8782058358192444, "learning_rate": 2e-05, "loss": 0.618, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10080, "tokens_per_second_per_gpu": 9889.51, "total_tokens": 995342963 }, { "epoch": 0.6302200550137534, "grad_norm": 0.9048380255699158, "learning_rate": 2e-05, "loss": 0.6684, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10081, "tokens_per_second_per_gpu": 10724.07, "total_tokens": 995446434 }, { "epoch": 0.6302825706426607, "grad_norm": 0.8556200861930847, "learning_rate": 2e-05, "loss": 0.6346, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10082, "tokens_per_second_per_gpu": 10693.92, "total_tokens": 995547097 }, { "epoch": 0.6303450862715679, "grad_norm": 0.9153466820716858, "learning_rate": 2e-05, "loss": 0.6493, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10083, "tokens_per_second_per_gpu": 10940.62, "total_tokens": 995647076 }, { "epoch": 0.6304076019004751, "grad_norm": 0.8890314698219299, "learning_rate": 2e-05, "loss": 0.6069, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10084, "tokens_per_second_per_gpu": 10764.95, "total_tokens": 995748657 }, { "epoch": 0.6304701175293823, "grad_norm": 0.9054707288742065, "learning_rate": 2e-05, "loss": 0.6537, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10085, "tokens_per_second_per_gpu": 10439.84, "total_tokens": 995847753 }, { "epoch": 0.6305326331582896, "grad_norm": 0.8723551034927368, "learning_rate": 2e-05, "loss": 0.6141, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10086, "tokens_per_second_per_gpu": 10997.42, "total_tokens": 995949553 }, { "epoch": 0.6305951487871968, "grad_norm": 0.8962733149528503, "learning_rate": 2e-05, "loss": 0.5971, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10087, "tokens_per_second_per_gpu": 10655.94, "total_tokens": 996046120 }, { "epoch": 0.630657664416104, "grad_norm": 0.8947820067405701, "learning_rate": 2e-05, "loss": 0.6536, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10088, "tokens_per_second_per_gpu": 10289.95, "total_tokens": 996146193 }, { "epoch": 0.6307201800450113, "grad_norm": 0.8670711517333984, "learning_rate": 2e-05, "loss": 0.6129, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10089, "tokens_per_second_per_gpu": 10694.69, "total_tokens": 996246508 }, { "epoch": 0.6307826956739184, "grad_norm": 0.8799389004707336, "learning_rate": 2e-05, "loss": 0.655, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10090, "tokens_per_second_per_gpu": 11097.37, "total_tokens": 996349199 }, { "epoch": 0.6308452113028257, "grad_norm": 0.9318959712982178, "learning_rate": 2e-05, "loss": 0.631, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10091, "tokens_per_second_per_gpu": 11101.32, "total_tokens": 996447532 }, { "epoch": 0.6309077269317329, "grad_norm": 0.90108722448349, "learning_rate": 2e-05, "loss": 0.6201, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10092, "tokens_per_second_per_gpu": 10234.28, "total_tokens": 996545338 }, { "epoch": 0.6309702425606402, "grad_norm": 0.8998445272445679, "learning_rate": 2e-05, "loss": 0.6218, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10093, "tokens_per_second_per_gpu": 10446.67, "total_tokens": 996643175 }, { "epoch": 0.6310327581895474, "grad_norm": 0.900448203086853, "learning_rate": 2e-05, "loss": 0.6447, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10094, "tokens_per_second_per_gpu": 10799.46, "total_tokens": 996743948 }, { "epoch": 0.6310952738184546, "grad_norm": 0.8844392895698547, "learning_rate": 2e-05, "loss": 0.6196, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10095, "tokens_per_second_per_gpu": 10195.52, "total_tokens": 996840509 }, { "epoch": 0.6311577894473619, "grad_norm": 0.9208524227142334, "learning_rate": 2e-05, "loss": 0.6496, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10096, "tokens_per_second_per_gpu": 10887.11, "total_tokens": 996942405 }, { "epoch": 0.631220305076269, "grad_norm": 0.9532647728919983, "learning_rate": 2e-05, "loss": 0.6354, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10097, "tokens_per_second_per_gpu": 10406.18, "total_tokens": 997034334 }, { "epoch": 0.6312828207051763, "grad_norm": 0.8940685987472534, "learning_rate": 2e-05, "loss": 0.6343, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10098, "tokens_per_second_per_gpu": 10729.77, "total_tokens": 997130085 }, { "epoch": 0.6313453363340835, "grad_norm": 0.9317903518676758, "learning_rate": 2e-05, "loss": 0.6188, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10099, "tokens_per_second_per_gpu": 10357.93, "total_tokens": 997226626 }, { "epoch": 0.6314078519629908, "grad_norm": 0.9176300168037415, "learning_rate": 2e-05, "loss": 0.6504, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10100, "tokens_per_second_per_gpu": 10238.24, "total_tokens": 997325117 }, { "epoch": 0.631470367591898, "grad_norm": 0.8725945353507996, "learning_rate": 2e-05, "loss": 0.6082, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10101, "tokens_per_second_per_gpu": 10717.74, "total_tokens": 997424527 }, { "epoch": 0.6315328832208053, "grad_norm": 0.9185271263122559, "learning_rate": 2e-05, "loss": 0.6169, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10102, "tokens_per_second_per_gpu": 10560.67, "total_tokens": 997524270 }, { "epoch": 0.6315953988497124, "grad_norm": 0.9259889125823975, "learning_rate": 2e-05, "loss": 0.6277, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10103, "tokens_per_second_per_gpu": 10817.62, "total_tokens": 997622517 }, { "epoch": 0.6316579144786196, "grad_norm": 0.8963137865066528, "learning_rate": 2e-05, "loss": 0.6624, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10104, "tokens_per_second_per_gpu": 10916.58, "total_tokens": 997721228 }, { "epoch": 0.6317204301075269, "grad_norm": 0.8864293694496155, "learning_rate": 2e-05, "loss": 0.6103, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10105, "tokens_per_second_per_gpu": 10755.59, "total_tokens": 997819397 }, { "epoch": 0.6317829457364341, "grad_norm": 0.8770402073860168, "learning_rate": 2e-05, "loss": 0.6134, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10106, "tokens_per_second_per_gpu": 9929.62, "total_tokens": 997915993 }, { "epoch": 0.6318454613653414, "grad_norm": 0.8994191884994507, "learning_rate": 2e-05, "loss": 0.6281, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10107, "tokens_per_second_per_gpu": 10208.79, "total_tokens": 998013771 }, { "epoch": 0.6319079769942486, "grad_norm": 0.8881841897964478, "learning_rate": 2e-05, "loss": 0.6161, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10108, "tokens_per_second_per_gpu": 10066.99, "total_tokens": 998109868 }, { "epoch": 0.6319704926231557, "grad_norm": 0.9143503904342651, "learning_rate": 2e-05, "loss": 0.6555, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10109, "tokens_per_second_per_gpu": 10950.77, "total_tokens": 998210918 }, { "epoch": 0.632033008252063, "grad_norm": 0.8736625909805298, "learning_rate": 2e-05, "loss": 0.5966, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10110, "tokens_per_second_per_gpu": 10322.79, "total_tokens": 998306876 }, { "epoch": 0.6320955238809702, "grad_norm": 0.9857970476150513, "learning_rate": 2e-05, "loss": 0.6653, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10111, "tokens_per_second_per_gpu": 10753.56, "total_tokens": 998403707 }, { "epoch": 0.6321580395098775, "grad_norm": 0.8791106343269348, "learning_rate": 2e-05, "loss": 0.6167, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10112, "tokens_per_second_per_gpu": 10607.14, "total_tokens": 998502264 }, { "epoch": 0.6322205551387847, "grad_norm": 0.9390974044799805, "learning_rate": 2e-05, "loss": 0.6179, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10113, "tokens_per_second_per_gpu": 10506.29, "total_tokens": 998600472 }, { "epoch": 0.632283070767692, "grad_norm": 0.8596065640449524, "learning_rate": 2e-05, "loss": 0.5792, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10114, "tokens_per_second_per_gpu": 10915.03, "total_tokens": 998703852 }, { "epoch": 0.6323455863965991, "grad_norm": 0.8392505049705505, "learning_rate": 2e-05, "loss": 0.636, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10115, "tokens_per_second_per_gpu": 10721.93, "total_tokens": 998806739 }, { "epoch": 0.6324081020255063, "grad_norm": 0.9233541488647461, "learning_rate": 2e-05, "loss": 0.6063, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10116, "tokens_per_second_per_gpu": 9599.91, "total_tokens": 998900251 }, { "epoch": 0.6324706176544136, "grad_norm": 0.877058207988739, "learning_rate": 2e-05, "loss": 0.5901, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10117, "tokens_per_second_per_gpu": 9704.61, "total_tokens": 998992506 }, { "epoch": 0.6325331332833208, "grad_norm": 0.867486834526062, "learning_rate": 2e-05, "loss": 0.613, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10118, "tokens_per_second_per_gpu": 10745.67, "total_tokens": 999092490 }, { "epoch": 0.6325956489122281, "grad_norm": 0.844262421131134, "learning_rate": 2e-05, "loss": 0.6007, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10119, "tokens_per_second_per_gpu": 11015.96, "total_tokens": 999191887 }, { "epoch": 0.6326581645411353, "grad_norm": 0.9296284317970276, "learning_rate": 2e-05, "loss": 0.6124, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10120, "tokens_per_second_per_gpu": 10704.18, "total_tokens": 999290909 }, { "epoch": 0.6327206801700425, "grad_norm": 0.9427357316017151, "learning_rate": 2e-05, "loss": 0.6175, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10121, "tokens_per_second_per_gpu": 9965.3, "total_tokens": 999385201 }, { "epoch": 0.6327831957989497, "grad_norm": 0.8775333166122437, "learning_rate": 2e-05, "loss": 0.5983, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10122, "tokens_per_second_per_gpu": 10083.58, "total_tokens": 999478492 }, { "epoch": 0.632845711427857, "grad_norm": 0.9198883175849915, "learning_rate": 2e-05, "loss": 0.6368, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10123, "tokens_per_second_per_gpu": 10060.03, "total_tokens": 999577450 }, { "epoch": 0.6329082270567642, "grad_norm": 0.9449241757392883, "learning_rate": 2e-05, "loss": 0.6199, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10124, "tokens_per_second_per_gpu": 10245.85, "total_tokens": 999671131 }, { "epoch": 0.6329707426856714, "grad_norm": 0.88682621717453, "learning_rate": 2e-05, "loss": 0.6487, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10125, "tokens_per_second_per_gpu": 10101.77, "total_tokens": 999767518 }, { "epoch": 0.6330332583145787, "grad_norm": 0.8723263740539551, "learning_rate": 2e-05, "loss": 0.6546, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10126, "tokens_per_second_per_gpu": 10713.63, "total_tokens": 999870954 }, { "epoch": 0.6330957739434858, "grad_norm": 0.8868517875671387, "learning_rate": 2e-05, "loss": 0.5681, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10127, "tokens_per_second_per_gpu": 10578.26, "total_tokens": 999968920 }, { "epoch": 0.6331582895723931, "grad_norm": 0.9117914438247681, "learning_rate": 2e-05, "loss": 0.6948, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10128, "tokens_per_second_per_gpu": 10492.91, "total_tokens": 1000067840 }, { "epoch": 0.6332208052013003, "grad_norm": 0.8826907873153687, "learning_rate": 2e-05, "loss": 0.605, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10129, "tokens_per_second_per_gpu": 11063.96, "total_tokens": 1000171024 }, { "epoch": 0.6332833208302076, "grad_norm": 0.8765431642532349, "learning_rate": 2e-05, "loss": 0.6164, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10130, "tokens_per_second_per_gpu": 10242.63, "total_tokens": 1000267158 }, { "epoch": 0.6333458364591148, "grad_norm": 0.8645956516265869, "learning_rate": 2e-05, "loss": 0.6069, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10131, "tokens_per_second_per_gpu": 9463.01, "total_tokens": 1000364424 }, { "epoch": 0.633408352088022, "grad_norm": 0.8746164441108704, "learning_rate": 2e-05, "loss": 0.5954, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10132, "tokens_per_second_per_gpu": 10050.9, "total_tokens": 1000458141 }, { "epoch": 0.6334708677169293, "grad_norm": 0.9071776270866394, "learning_rate": 2e-05, "loss": 0.6172, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10133, "tokens_per_second_per_gpu": 10333.35, "total_tokens": 1000551459 }, { "epoch": 0.6335333833458364, "grad_norm": 0.8912730813026428, "learning_rate": 2e-05, "loss": 0.6823, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10134, "tokens_per_second_per_gpu": 10822.68, "total_tokens": 1000650233 }, { "epoch": 0.6335958989747437, "grad_norm": 0.8637313842773438, "learning_rate": 2e-05, "loss": 0.6248, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10135, "tokens_per_second_per_gpu": 11118.73, "total_tokens": 1000753181 }, { "epoch": 0.6336584146036509, "grad_norm": 1.3285751342773438, "learning_rate": 2e-05, "loss": 0.6683, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10136, "tokens_per_second_per_gpu": 10671.25, "total_tokens": 1000853728 }, { "epoch": 0.6337209302325582, "grad_norm": 0.8351511359214783, "learning_rate": 2e-05, "loss": 0.6192, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10137, "tokens_per_second_per_gpu": 10981.26, "total_tokens": 1000956652 }, { "epoch": 0.6337834458614654, "grad_norm": 0.8944922685623169, "learning_rate": 2e-05, "loss": 0.6789, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10138, "tokens_per_second_per_gpu": 10264.55, "total_tokens": 1001052097 }, { "epoch": 0.6338459614903726, "grad_norm": 0.8842337727546692, "learning_rate": 2e-05, "loss": 0.6392, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10139, "tokens_per_second_per_gpu": 10606.25, "total_tokens": 1001150562 }, { "epoch": 0.6339084771192798, "grad_norm": 0.9264272451400757, "learning_rate": 2e-05, "loss": 0.6223, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10140, "tokens_per_second_per_gpu": 10386.15, "total_tokens": 1001246656 }, { "epoch": 0.633970992748187, "grad_norm": 0.880230724811554, "learning_rate": 2e-05, "loss": 0.6117, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10141, "tokens_per_second_per_gpu": 10909.62, "total_tokens": 1001344419 }, { "epoch": 0.6340335083770943, "grad_norm": 0.8813479542732239, "learning_rate": 2e-05, "loss": 0.6738, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10142, "tokens_per_second_per_gpu": 10639.81, "total_tokens": 1001447209 }, { "epoch": 0.6340960240060015, "grad_norm": 1.0138540267944336, "learning_rate": 2e-05, "loss": 0.6444, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10143, "tokens_per_second_per_gpu": 10724.5, "total_tokens": 1001548350 }, { "epoch": 0.6341585396349088, "grad_norm": 1.1195299625396729, "learning_rate": 2e-05, "loss": 0.6245, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10144, "tokens_per_second_per_gpu": 10597.5, "total_tokens": 1001646594 }, { "epoch": 0.634221055263816, "grad_norm": 0.9296412467956543, "learning_rate": 2e-05, "loss": 0.6383, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10145, "tokens_per_second_per_gpu": 11219.22, "total_tokens": 1001744389 }, { "epoch": 0.6342835708927231, "grad_norm": 0.8563715815544128, "learning_rate": 2e-05, "loss": 0.5887, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10146, "tokens_per_second_per_gpu": 10042.24, "total_tokens": 1001839827 }, { "epoch": 0.6343460865216304, "grad_norm": 0.9530136585235596, "learning_rate": 2e-05, "loss": 0.5619, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10147, "tokens_per_second_per_gpu": 9914.55, "total_tokens": 1001934547 }, { "epoch": 0.6344086021505376, "grad_norm": 0.8938891291618347, "learning_rate": 2e-05, "loss": 0.6091, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10148, "tokens_per_second_per_gpu": 11075.81, "total_tokens": 1002034913 }, { "epoch": 0.6344711177794449, "grad_norm": 0.9420276284217834, "learning_rate": 2e-05, "loss": 0.6394, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10149, "tokens_per_second_per_gpu": 10388.51, "total_tokens": 1002134168 }, { "epoch": 0.6345336334083521, "grad_norm": 0.8900229334831238, "learning_rate": 2e-05, "loss": 0.6483, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10150, "tokens_per_second_per_gpu": 10496.84, "total_tokens": 1002236352 }, { "epoch": 0.6345961490372594, "grad_norm": 0.8839912414550781, "learning_rate": 2e-05, "loss": 0.6302, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10151, "tokens_per_second_per_gpu": 10897.79, "total_tokens": 1002339459 }, { "epoch": 0.6346586646661665, "grad_norm": 0.93154376745224, "learning_rate": 2e-05, "loss": 0.5976, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10152, "tokens_per_second_per_gpu": 10260.55, "total_tokens": 1002434990 }, { "epoch": 0.6347211802950737, "grad_norm": 0.9191861748695374, "learning_rate": 2e-05, "loss": 0.6293, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10153, "tokens_per_second_per_gpu": 10430.3, "total_tokens": 1002531517 }, { "epoch": 0.634783695923981, "grad_norm": 0.9037044644355774, "learning_rate": 2e-05, "loss": 0.6082, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10154, "tokens_per_second_per_gpu": 10454.94, "total_tokens": 1002627293 }, { "epoch": 0.6348462115528882, "grad_norm": 0.8659189939498901, "learning_rate": 2e-05, "loss": 0.5744, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10155, "tokens_per_second_per_gpu": 10013.92, "total_tokens": 1002724130 }, { "epoch": 0.6349087271817955, "grad_norm": 0.8883634805679321, "learning_rate": 2e-05, "loss": 0.6305, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10156, "tokens_per_second_per_gpu": 10540.49, "total_tokens": 1002821451 }, { "epoch": 0.6349712428107027, "grad_norm": 0.9110449552536011, "learning_rate": 2e-05, "loss": 0.6167, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10157, "tokens_per_second_per_gpu": 9271.89, "total_tokens": 1002915639 }, { "epoch": 0.6350337584396099, "grad_norm": 0.9362255334854126, "learning_rate": 2e-05, "loss": 0.5852, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10158, "tokens_per_second_per_gpu": 10309.86, "total_tokens": 1003010917 }, { "epoch": 0.6350962740685171, "grad_norm": 0.9281671643257141, "learning_rate": 2e-05, "loss": 0.5951, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10159, "tokens_per_second_per_gpu": 9530.67, "total_tokens": 1003101499 }, { "epoch": 0.6351587896974243, "grad_norm": 0.9201387763023376, "learning_rate": 2e-05, "loss": 0.6552, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10160, "tokens_per_second_per_gpu": 11139.91, "total_tokens": 1003207126 }, { "epoch": 0.6352213053263316, "grad_norm": 0.9252656698226929, "learning_rate": 2e-05, "loss": 0.6053, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10161, "tokens_per_second_per_gpu": 10579.98, "total_tokens": 1003299006 }, { "epoch": 0.6352838209552388, "grad_norm": 0.9466423392295837, "learning_rate": 2e-05, "loss": 0.7094, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10162, "tokens_per_second_per_gpu": 11311.38, "total_tokens": 1003403785 }, { "epoch": 0.6353463365841461, "grad_norm": 0.9356610178947449, "learning_rate": 2e-05, "loss": 0.6522, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10163, "tokens_per_second_per_gpu": 10963.08, "total_tokens": 1003503572 }, { "epoch": 0.6354088522130532, "grad_norm": 0.8546147346496582, "learning_rate": 2e-05, "loss": 0.6322, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10164, "tokens_per_second_per_gpu": 10802.21, "total_tokens": 1003603054 }, { "epoch": 0.6354713678419605, "grad_norm": 0.8775519728660583, "learning_rate": 2e-05, "loss": 0.6362, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10165, "tokens_per_second_per_gpu": 10518.12, "total_tokens": 1003704560 }, { "epoch": 0.6355338834708677, "grad_norm": 0.9281776547431946, "learning_rate": 2e-05, "loss": 0.6591, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10166, "tokens_per_second_per_gpu": 10036.59, "total_tokens": 1003804026 }, { "epoch": 0.635596399099775, "grad_norm": 0.8506978750228882, "learning_rate": 2e-05, "loss": 0.5922, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10167, "tokens_per_second_per_gpu": 10209.51, "total_tokens": 1003902922 }, { "epoch": 0.6356589147286822, "grad_norm": 0.9250563979148865, "learning_rate": 2e-05, "loss": 0.687, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10168, "tokens_per_second_per_gpu": 10882.52, "total_tokens": 1004004444 }, { "epoch": 0.6357214303575894, "grad_norm": 0.9298645257949829, "learning_rate": 2e-05, "loss": 0.6209, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10169, "tokens_per_second_per_gpu": 10216.42, "total_tokens": 1004102861 }, { "epoch": 0.6357839459864967, "grad_norm": 0.8673108220100403, "learning_rate": 2e-05, "loss": 0.6279, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10170, "tokens_per_second_per_gpu": 11139.71, "total_tokens": 1004203800 }, { "epoch": 0.6358464616154038, "grad_norm": 0.9184547066688538, "learning_rate": 2e-05, "loss": 0.6562, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10171, "tokens_per_second_per_gpu": 10453.11, "total_tokens": 1004305437 }, { "epoch": 0.6359089772443111, "grad_norm": 0.9177313446998596, "learning_rate": 2e-05, "loss": 0.6757, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10172, "tokens_per_second_per_gpu": 10451.0, "total_tokens": 1004403745 }, { "epoch": 0.6359714928732183, "grad_norm": 0.9504458904266357, "learning_rate": 2e-05, "loss": 0.644, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10173, "tokens_per_second_per_gpu": 9793.75, "total_tokens": 1004497315 }, { "epoch": 0.6360340085021255, "grad_norm": 0.8874741792678833, "learning_rate": 2e-05, "loss": 0.6352, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10174, "tokens_per_second_per_gpu": 11383.87, "total_tokens": 1004603307 }, { "epoch": 0.6360965241310328, "grad_norm": 0.889884352684021, "learning_rate": 2e-05, "loss": 0.6472, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10175, "tokens_per_second_per_gpu": 10649.87, "total_tokens": 1004707278 }, { "epoch": 0.63615903975994, "grad_norm": 0.89464271068573, "learning_rate": 2e-05, "loss": 0.5551, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10176, "tokens_per_second_per_gpu": 9418.45, "total_tokens": 1004800275 }, { "epoch": 0.6362215553888472, "grad_norm": 0.8586786985397339, "learning_rate": 2e-05, "loss": 0.6292, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10177, "tokens_per_second_per_gpu": 10309.06, "total_tokens": 1004899498 }, { "epoch": 0.6362840710177544, "grad_norm": 0.9160348773002625, "learning_rate": 2e-05, "loss": 0.6123, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10178, "tokens_per_second_per_gpu": 10729.36, "total_tokens": 1004994263 }, { "epoch": 0.6363465866466617, "grad_norm": 0.9285356402397156, "learning_rate": 2e-05, "loss": 0.6171, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10179, "tokens_per_second_per_gpu": 10227.07, "total_tokens": 1005089462 }, { "epoch": 0.6364091022755689, "grad_norm": 0.8577059507369995, "learning_rate": 2e-05, "loss": 0.6116, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10180, "tokens_per_second_per_gpu": 10818.16, "total_tokens": 1005192076 }, { "epoch": 0.6364716179044762, "grad_norm": 0.9518497586250305, "learning_rate": 2e-05, "loss": 0.6818, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10181, "tokens_per_second_per_gpu": 10849.9, "total_tokens": 1005294425 }, { "epoch": 0.6365341335333834, "grad_norm": 0.8977240324020386, "learning_rate": 2e-05, "loss": 0.6424, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10182, "tokens_per_second_per_gpu": 10702.09, "total_tokens": 1005394406 }, { "epoch": 0.6365966491622905, "grad_norm": 0.8725643754005432, "learning_rate": 2e-05, "loss": 0.6013, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10183, "tokens_per_second_per_gpu": 10137.82, "total_tokens": 1005494117 }, { "epoch": 0.6366591647911978, "grad_norm": 0.8418813943862915, "learning_rate": 2e-05, "loss": 0.6038, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10184, "tokens_per_second_per_gpu": 10651.65, "total_tokens": 1005594296 }, { "epoch": 0.636721680420105, "grad_norm": 0.8534359931945801, "learning_rate": 2e-05, "loss": 0.6033, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10185, "tokens_per_second_per_gpu": 10488.42, "total_tokens": 1005691947 }, { "epoch": 0.6367841960490123, "grad_norm": 0.9168412685394287, "learning_rate": 2e-05, "loss": 0.6433, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10186, "tokens_per_second_per_gpu": 10722.71, "total_tokens": 1005792558 }, { "epoch": 0.6368467116779195, "grad_norm": 0.8923185467720032, "learning_rate": 2e-05, "loss": 0.6466, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10187, "tokens_per_second_per_gpu": 9776.75, "total_tokens": 1005885773 }, { "epoch": 0.6369092273068268, "grad_norm": 0.8550992012023926, "learning_rate": 2e-05, "loss": 0.6319, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10188, "tokens_per_second_per_gpu": 10308.92, "total_tokens": 1005985141 }, { "epoch": 0.6369717429357339, "grad_norm": 0.8587422370910645, "learning_rate": 2e-05, "loss": 0.6193, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10189, "tokens_per_second_per_gpu": 10801.01, "total_tokens": 1006086252 }, { "epoch": 0.6370342585646411, "grad_norm": 0.8869762420654297, "learning_rate": 2e-05, "loss": 0.6283, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10190, "tokens_per_second_per_gpu": 10908.75, "total_tokens": 1006187330 }, { "epoch": 0.6370967741935484, "grad_norm": 0.8842676281929016, "learning_rate": 2e-05, "loss": 0.6429, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10191, "tokens_per_second_per_gpu": 10973.79, "total_tokens": 1006285207 }, { "epoch": 0.6371592898224556, "grad_norm": 0.8783254623413086, "learning_rate": 2e-05, "loss": 0.6837, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10192, "tokens_per_second_per_gpu": 10556.4, "total_tokens": 1006384863 }, { "epoch": 0.6372218054513629, "grad_norm": 0.9037873148918152, "learning_rate": 2e-05, "loss": 0.5944, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10193, "tokens_per_second_per_gpu": 9845.7, "total_tokens": 1006481169 }, { "epoch": 0.6372843210802701, "grad_norm": 0.900358259677887, "learning_rate": 2e-05, "loss": 0.6192, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10194, "tokens_per_second_per_gpu": 10390.09, "total_tokens": 1006577431 }, { "epoch": 0.6373468367091772, "grad_norm": 0.9008227586746216, "learning_rate": 2e-05, "loss": 0.6331, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10195, "tokens_per_second_per_gpu": 10563.86, "total_tokens": 1006673740 }, { "epoch": 0.6374093523380845, "grad_norm": 0.8990154266357422, "learning_rate": 2e-05, "loss": 0.5863, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10196, "tokens_per_second_per_gpu": 10020.0, "total_tokens": 1006769199 }, { "epoch": 0.6374718679669917, "grad_norm": 0.9172396063804626, "learning_rate": 2e-05, "loss": 0.6567, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10197, "tokens_per_second_per_gpu": 10010.45, "total_tokens": 1006865326 }, { "epoch": 0.637534383595899, "grad_norm": 0.9044919610023499, "learning_rate": 2e-05, "loss": 0.6643, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10198, "tokens_per_second_per_gpu": 10474.67, "total_tokens": 1006964793 }, { "epoch": 0.6375968992248062, "grad_norm": 0.8775956034660339, "learning_rate": 2e-05, "loss": 0.6197, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10199, "tokens_per_second_per_gpu": 11114.29, "total_tokens": 1007061334 }, { "epoch": 0.6376594148537135, "grad_norm": 0.9105966091156006, "learning_rate": 2e-05, "loss": 0.659, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10200, "tokens_per_second_per_gpu": 10232.84, "total_tokens": 1007157311 }, { "epoch": 0.6377219304826206, "grad_norm": 0.8814555406570435, "learning_rate": 2e-05, "loss": 0.6277, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10201, "tokens_per_second_per_gpu": 10663.95, "total_tokens": 1007256162 }, { "epoch": 0.6377844461115278, "grad_norm": 0.8788281679153442, "learning_rate": 2e-05, "loss": 0.6109, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10202, "tokens_per_second_per_gpu": 9842.05, "total_tokens": 1007353112 }, { "epoch": 0.6378469617404351, "grad_norm": 0.8830656409263611, "learning_rate": 2e-05, "loss": 0.6068, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10203, "tokens_per_second_per_gpu": 10714.27, "total_tokens": 1007450870 }, { "epoch": 0.6379094773693423, "grad_norm": 0.9488105177879333, "learning_rate": 2e-05, "loss": 0.6204, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10204, "tokens_per_second_per_gpu": 10655.23, "total_tokens": 1007548886 }, { "epoch": 0.6379719929982496, "grad_norm": 0.8867582082748413, "learning_rate": 2e-05, "loss": 0.6446, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10205, "tokens_per_second_per_gpu": 9809.22, "total_tokens": 1007645233 }, { "epoch": 0.6380345086271568, "grad_norm": 0.8405340313911438, "learning_rate": 2e-05, "loss": 0.5566, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10206, "tokens_per_second_per_gpu": 10836.33, "total_tokens": 1007744036 }, { "epoch": 0.6380970242560641, "grad_norm": 0.9319478869438171, "learning_rate": 2e-05, "loss": 0.6604, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10207, "tokens_per_second_per_gpu": 10335.46, "total_tokens": 1007839310 }, { "epoch": 0.6381595398849712, "grad_norm": 0.9198912382125854, "learning_rate": 2e-05, "loss": 0.5978, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10208, "tokens_per_second_per_gpu": 9721.46, "total_tokens": 1007935933 }, { "epoch": 0.6382220555138785, "grad_norm": 0.9088091850280762, "learning_rate": 2e-05, "loss": 0.658, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10209, "tokens_per_second_per_gpu": 11271.35, "total_tokens": 1008041174 }, { "epoch": 0.6382845711427857, "grad_norm": 0.8669588565826416, "learning_rate": 2e-05, "loss": 0.6133, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10210, "tokens_per_second_per_gpu": 10492.23, "total_tokens": 1008138510 }, { "epoch": 0.6383470867716929, "grad_norm": 0.9316696524620056, "learning_rate": 2e-05, "loss": 0.6416, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10211, "tokens_per_second_per_gpu": 10295.29, "total_tokens": 1008235262 }, { "epoch": 0.6384096024006002, "grad_norm": 0.8959510326385498, "learning_rate": 2e-05, "loss": 0.6055, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10212, "tokens_per_second_per_gpu": 10355.24, "total_tokens": 1008328743 }, { "epoch": 0.6384721180295074, "grad_norm": 0.9074516296386719, "learning_rate": 2e-05, "loss": 0.6685, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10213, "tokens_per_second_per_gpu": 9699.76, "total_tokens": 1008426567 }, { "epoch": 0.6385346336584146, "grad_norm": 0.8879857063293457, "learning_rate": 2e-05, "loss": 0.6124, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10214, "tokens_per_second_per_gpu": 10515.07, "total_tokens": 1008525149 }, { "epoch": 0.6385971492873218, "grad_norm": 0.9119600653648376, "learning_rate": 2e-05, "loss": 0.6199, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10215, "tokens_per_second_per_gpu": 10730.76, "total_tokens": 1008625169 }, { "epoch": 0.638659664916229, "grad_norm": 0.8653907775878906, "learning_rate": 2e-05, "loss": 0.6418, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10216, "tokens_per_second_per_gpu": 10808.37, "total_tokens": 1008728562 }, { "epoch": 0.6387221805451363, "grad_norm": 0.8877859711647034, "learning_rate": 2e-05, "loss": 0.6355, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10217, "tokens_per_second_per_gpu": 10855.03, "total_tokens": 1008829951 }, { "epoch": 0.6387846961740435, "grad_norm": 0.8910562992095947, "learning_rate": 2e-05, "loss": 0.6488, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10218, "tokens_per_second_per_gpu": 10576.47, "total_tokens": 1008931474 }, { "epoch": 0.6388472118029508, "grad_norm": 0.9135209918022156, "learning_rate": 2e-05, "loss": 0.5797, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10219, "tokens_per_second_per_gpu": 13016.57, "total_tokens": 1009020655 }, { "epoch": 0.6389097274318579, "grad_norm": 0.885432243347168, "learning_rate": 2e-05, "loss": 0.6248, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10220, "tokens_per_second_per_gpu": 9975.08, "total_tokens": 1009114964 }, { "epoch": 0.6389722430607652, "grad_norm": 0.9157154560089111, "learning_rate": 2e-05, "loss": 0.6396, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10221, "tokens_per_second_per_gpu": 10168.77, "total_tokens": 1009211992 }, { "epoch": 0.6390347586896724, "grad_norm": 0.8470210433006287, "learning_rate": 2e-05, "loss": 0.5785, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10222, "tokens_per_second_per_gpu": 10527.57, "total_tokens": 1009306978 }, { "epoch": 0.6390972743185797, "grad_norm": 0.8877073526382446, "learning_rate": 2e-05, "loss": 0.623, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10223, "tokens_per_second_per_gpu": 10964.45, "total_tokens": 1009409919 }, { "epoch": 0.6391597899474869, "grad_norm": 0.8700152039527893, "learning_rate": 2e-05, "loss": 0.6491, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10224, "tokens_per_second_per_gpu": 10695.97, "total_tokens": 1009510494 }, { "epoch": 0.6392223055763941, "grad_norm": 0.8514605760574341, "learning_rate": 2e-05, "loss": 0.578, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10225, "tokens_per_second_per_gpu": 9886.37, "total_tokens": 1009605784 }, { "epoch": 0.6392848212053013, "grad_norm": 0.8812305927276611, "learning_rate": 2e-05, "loss": 0.6311, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10226, "tokens_per_second_per_gpu": 10985.01, "total_tokens": 1009706484 }, { "epoch": 0.6393473368342085, "grad_norm": 0.8916809558868408, "learning_rate": 2e-05, "loss": 0.6085, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10227, "tokens_per_second_per_gpu": 10544.12, "total_tokens": 1009804191 }, { "epoch": 0.6394098524631158, "grad_norm": 0.8876069188117981, "learning_rate": 2e-05, "loss": 0.6371, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10228, "tokens_per_second_per_gpu": 10558.72, "total_tokens": 1009900490 }, { "epoch": 0.639472368092023, "grad_norm": 0.8759442567825317, "learning_rate": 2e-05, "loss": 0.6068, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10229, "tokens_per_second_per_gpu": 10295.89, "total_tokens": 1009998623 }, { "epoch": 0.6395348837209303, "grad_norm": 0.8521327376365662, "learning_rate": 2e-05, "loss": 0.621, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10230, "tokens_per_second_per_gpu": 11079.98, "total_tokens": 1010100078 }, { "epoch": 0.6395973993498375, "grad_norm": 0.9241762161254883, "learning_rate": 2e-05, "loss": 0.6514, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10231, "tokens_per_second_per_gpu": 10906.89, "total_tokens": 1010198509 }, { "epoch": 0.6396599149787446, "grad_norm": 0.8904435038566589, "learning_rate": 2e-05, "loss": 0.6024, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10232, "tokens_per_second_per_gpu": 10171.17, "total_tokens": 1010294009 }, { "epoch": 0.6397224306076519, "grad_norm": 0.8731220960617065, "learning_rate": 2e-05, "loss": 0.6099, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10233, "tokens_per_second_per_gpu": 9553.1, "total_tokens": 1010388291 }, { "epoch": 0.6397849462365591, "grad_norm": 0.9461123943328857, "learning_rate": 2e-05, "loss": 0.6013, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10234, "tokens_per_second_per_gpu": 10459.9, "total_tokens": 1010487135 }, { "epoch": 0.6398474618654664, "grad_norm": 0.889565110206604, "learning_rate": 2e-05, "loss": 0.6631, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10235, "tokens_per_second_per_gpu": 10487.12, "total_tokens": 1010587965 }, { "epoch": 0.6399099774943736, "grad_norm": 0.8807206153869629, "learning_rate": 2e-05, "loss": 0.6217, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10236, "tokens_per_second_per_gpu": 11106.39, "total_tokens": 1010686625 }, { "epoch": 0.6399724931232809, "grad_norm": 0.8905945420265198, "learning_rate": 2e-05, "loss": 0.6178, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10237, "tokens_per_second_per_gpu": 10523.79, "total_tokens": 1010786376 }, { "epoch": 0.640035008752188, "grad_norm": 0.9127029776573181, "learning_rate": 2e-05, "loss": 0.6225, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10238, "tokens_per_second_per_gpu": 11009.45, "total_tokens": 1010886041 }, { "epoch": 0.6400975243810952, "grad_norm": 0.9018547534942627, "learning_rate": 2e-05, "loss": 0.6371, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10239, "tokens_per_second_per_gpu": 10883.36, "total_tokens": 1010986972 }, { "epoch": 0.6401600400100025, "grad_norm": 0.9008487462997437, "learning_rate": 2e-05, "loss": 0.6795, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10240, "tokens_per_second_per_gpu": 11839.69, "total_tokens": 1011090110 }, { "epoch": 0.6402225556389097, "grad_norm": 0.8963140249252319, "learning_rate": 2e-05, "loss": 0.6531, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10241, "tokens_per_second_per_gpu": 10897.16, "total_tokens": 1011192773 }, { "epoch": 0.640285071267817, "grad_norm": 0.8841565847396851, "learning_rate": 2e-05, "loss": 0.6502, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10242, "tokens_per_second_per_gpu": 10414.58, "total_tokens": 1011293926 }, { "epoch": 0.6403475868967242, "grad_norm": 0.8799310326576233, "learning_rate": 2e-05, "loss": 0.6216, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10243, "tokens_per_second_per_gpu": 10503.56, "total_tokens": 1011393801 }, { "epoch": 0.6404101025256314, "grad_norm": 0.9042180776596069, "learning_rate": 2e-05, "loss": 0.6373, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10244, "tokens_per_second_per_gpu": 10926.96, "total_tokens": 1011496874 }, { "epoch": 0.6404726181545386, "grad_norm": 0.9048487544059753, "learning_rate": 2e-05, "loss": 0.6361, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10245, "tokens_per_second_per_gpu": 9846.79, "total_tokens": 1011595558 }, { "epoch": 0.6405351337834458, "grad_norm": 0.8879871368408203, "learning_rate": 2e-05, "loss": 0.6268, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10246, "tokens_per_second_per_gpu": 10856.54, "total_tokens": 1011692740 }, { "epoch": 0.6405976494123531, "grad_norm": 0.9683266878128052, "learning_rate": 2e-05, "loss": 0.6497, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10247, "tokens_per_second_per_gpu": 10890.09, "total_tokens": 1011792751 }, { "epoch": 0.6406601650412603, "grad_norm": 0.8869860172271729, "learning_rate": 2e-05, "loss": 0.6371, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10248, "tokens_per_second_per_gpu": 11166.22, "total_tokens": 1011890752 }, { "epoch": 0.6407226806701676, "grad_norm": 0.9615263938903809, "learning_rate": 2e-05, "loss": 0.6525, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10249, "tokens_per_second_per_gpu": 10200.0, "total_tokens": 1011985526 }, { "epoch": 0.6407851962990748, "grad_norm": 0.9039618372917175, "learning_rate": 2e-05, "loss": 0.6606, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10250, "tokens_per_second_per_gpu": 10907.49, "total_tokens": 1012087254 }, { "epoch": 0.640847711927982, "grad_norm": 0.8983783721923828, "learning_rate": 2e-05, "loss": 0.6291, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10251, "tokens_per_second_per_gpu": 10412.9, "total_tokens": 1012186298 }, { "epoch": 0.6409102275568892, "grad_norm": 0.8857883214950562, "learning_rate": 2e-05, "loss": 0.6083, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10252, "tokens_per_second_per_gpu": 10392.01, "total_tokens": 1012283507 }, { "epoch": 0.6409727431857964, "grad_norm": 0.9798785448074341, "learning_rate": 2e-05, "loss": 0.6549, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10253, "tokens_per_second_per_gpu": 9838.31, "total_tokens": 1012378962 }, { "epoch": 0.6410352588147037, "grad_norm": 0.8823071122169495, "learning_rate": 2e-05, "loss": 0.5813, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10254, "tokens_per_second_per_gpu": 10559.25, "total_tokens": 1012477167 }, { "epoch": 0.6410977744436109, "grad_norm": 0.852687418460846, "learning_rate": 2e-05, "loss": 0.6317, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10255, "tokens_per_second_per_gpu": 10577.0, "total_tokens": 1012577935 }, { "epoch": 0.6411602900725182, "grad_norm": 0.8907811045646667, "learning_rate": 2e-05, "loss": 0.6094, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10256, "tokens_per_second_per_gpu": 10746.02, "total_tokens": 1012678040 }, { "epoch": 0.6412228057014253, "grad_norm": 0.9050099849700928, "learning_rate": 2e-05, "loss": 0.5996, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10257, "tokens_per_second_per_gpu": 9915.83, "total_tokens": 1012774184 }, { "epoch": 0.6412853213303326, "grad_norm": 0.8929342031478882, "learning_rate": 2e-05, "loss": 0.581, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10258, "tokens_per_second_per_gpu": 9739.86, "total_tokens": 1012870125 }, { "epoch": 0.6413478369592398, "grad_norm": 0.8834391236305237, "learning_rate": 2e-05, "loss": 0.5968, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10259, "tokens_per_second_per_gpu": 10805.86, "total_tokens": 1012966333 }, { "epoch": 0.641410352588147, "grad_norm": 0.8747747540473938, "learning_rate": 2e-05, "loss": 0.6207, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10260, "tokens_per_second_per_gpu": 10718.35, "total_tokens": 1013062892 }, { "epoch": 0.6414728682170543, "grad_norm": 0.9110782742500305, "learning_rate": 2e-05, "loss": 0.641, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10261, "tokens_per_second_per_gpu": 10145.0, "total_tokens": 1013160935 }, { "epoch": 0.6415353838459615, "grad_norm": 0.9404491186141968, "learning_rate": 2e-05, "loss": 0.5911, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10262, "tokens_per_second_per_gpu": 10216.86, "total_tokens": 1013254459 }, { "epoch": 0.6415978994748687, "grad_norm": 0.8892062902450562, "learning_rate": 2e-05, "loss": 0.5952, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10263, "tokens_per_second_per_gpu": 9150.91, "total_tokens": 1013348089 }, { "epoch": 0.6416604151037759, "grad_norm": 0.8737387657165527, "learning_rate": 2e-05, "loss": 0.6553, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10264, "tokens_per_second_per_gpu": 10897.63, "total_tokens": 1013448089 }, { "epoch": 0.6417229307326832, "grad_norm": 0.9018800258636475, "learning_rate": 2e-05, "loss": 0.5917, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10265, "tokens_per_second_per_gpu": 10255.35, "total_tokens": 1013540843 }, { "epoch": 0.6417854463615904, "grad_norm": 0.9412946701049805, "learning_rate": 2e-05, "loss": 0.6318, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10266, "tokens_per_second_per_gpu": 10335.01, "total_tokens": 1013639900 }, { "epoch": 0.6418479619904977, "grad_norm": 0.9877902269363403, "learning_rate": 2e-05, "loss": 0.6458, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10267, "tokens_per_second_per_gpu": 10320.91, "total_tokens": 1013736587 }, { "epoch": 0.6419104776194049, "grad_norm": 0.8743343949317932, "learning_rate": 2e-05, "loss": 0.6101, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10268, "tokens_per_second_per_gpu": 10110.62, "total_tokens": 1013835752 }, { "epoch": 0.641972993248312, "grad_norm": 0.9195433259010315, "learning_rate": 2e-05, "loss": 0.6407, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10269, "tokens_per_second_per_gpu": 10512.67, "total_tokens": 1013933583 }, { "epoch": 0.6420355088772193, "grad_norm": 0.9291447401046753, "learning_rate": 2e-05, "loss": 0.6129, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10270, "tokens_per_second_per_gpu": 9166.41, "total_tokens": 1014027282 }, { "epoch": 0.6420980245061265, "grad_norm": 0.9439395070075989, "learning_rate": 2e-05, "loss": 0.6848, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10271, "tokens_per_second_per_gpu": 10519.5, "total_tokens": 1014126790 }, { "epoch": 0.6421605401350338, "grad_norm": 0.9207051992416382, "learning_rate": 2e-05, "loss": 0.6074, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10272, "tokens_per_second_per_gpu": 9605.66, "total_tokens": 1014221196 }, { "epoch": 0.642223055763941, "grad_norm": 0.8706796765327454, "learning_rate": 2e-05, "loss": 0.5735, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10273, "tokens_per_second_per_gpu": 9751.19, "total_tokens": 1014313891 }, { "epoch": 0.6422855713928483, "grad_norm": 0.9230775237083435, "learning_rate": 2e-05, "loss": 0.6736, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10274, "tokens_per_second_per_gpu": 10372.32, "total_tokens": 1014415072 }, { "epoch": 0.6423480870217554, "grad_norm": 0.8817582726478577, "learning_rate": 2e-05, "loss": 0.6293, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10275, "tokens_per_second_per_gpu": 11031.68, "total_tokens": 1014517054 }, { "epoch": 0.6424106026506626, "grad_norm": 0.8749617338180542, "learning_rate": 2e-05, "loss": 0.6638, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10276, "tokens_per_second_per_gpu": 10438.13, "total_tokens": 1014619986 }, { "epoch": 0.6424731182795699, "grad_norm": 0.8833463788032532, "learning_rate": 2e-05, "loss": 0.6259, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10277, "tokens_per_second_per_gpu": 9630.27, "total_tokens": 1014716505 }, { "epoch": 0.6425356339084771, "grad_norm": 0.8799347877502441, "learning_rate": 2e-05, "loss": 0.6208, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10278, "tokens_per_second_per_gpu": 10204.79, "total_tokens": 1014815684 }, { "epoch": 0.6425981495373844, "grad_norm": 0.8667165040969849, "learning_rate": 2e-05, "loss": 0.5993, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10279, "tokens_per_second_per_gpu": 9114.07, "total_tokens": 1014909136 }, { "epoch": 0.6426606651662916, "grad_norm": 0.9317855834960938, "learning_rate": 2e-05, "loss": 0.5807, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10280, "tokens_per_second_per_gpu": 10195.95, "total_tokens": 1015004952 }, { "epoch": 0.6427231807951987, "grad_norm": 0.9020353555679321, "learning_rate": 2e-05, "loss": 0.6363, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10281, "tokens_per_second_per_gpu": 10744.97, "total_tokens": 1015101468 }, { "epoch": 0.642785696424106, "grad_norm": 0.9115752577781677, "learning_rate": 2e-05, "loss": 0.6593, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10282, "tokens_per_second_per_gpu": 10890.59, "total_tokens": 1015203408 }, { "epoch": 0.6428482120530132, "grad_norm": 0.9021733999252319, "learning_rate": 2e-05, "loss": 0.6163, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10283, "tokens_per_second_per_gpu": 10257.85, "total_tokens": 1015298088 }, { "epoch": 0.6429107276819205, "grad_norm": 0.9113073348999023, "learning_rate": 2e-05, "loss": 0.6047, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10284, "tokens_per_second_per_gpu": 10642.68, "total_tokens": 1015394899 }, { "epoch": 0.6429732433108277, "grad_norm": 0.9299665689468384, "learning_rate": 2e-05, "loss": 0.6304, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10285, "tokens_per_second_per_gpu": 9726.28, "total_tokens": 1015485637 }, { "epoch": 0.643035758939735, "grad_norm": 0.8877792358398438, "learning_rate": 2e-05, "loss": 0.6349, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10286, "tokens_per_second_per_gpu": 11181.21, "total_tokens": 1015587894 }, { "epoch": 0.6430982745686422, "grad_norm": 0.8859670758247375, "learning_rate": 2e-05, "loss": 0.6436, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10287, "tokens_per_second_per_gpu": 10212.68, "total_tokens": 1015687569 }, { "epoch": 0.6431607901975493, "grad_norm": 0.8842058777809143, "learning_rate": 2e-05, "loss": 0.6068, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10288, "tokens_per_second_per_gpu": 11102.78, "total_tokens": 1015787958 }, { "epoch": 0.6432233058264566, "grad_norm": 0.9077631831169128, "learning_rate": 2e-05, "loss": 0.6652, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10289, "tokens_per_second_per_gpu": 10264.73, "total_tokens": 1015888845 }, { "epoch": 0.6432858214553638, "grad_norm": 0.8930439352989197, "learning_rate": 2e-05, "loss": 0.6205, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10290, "tokens_per_second_per_gpu": 9725.3, "total_tokens": 1015982198 }, { "epoch": 0.6433483370842711, "grad_norm": 0.8974465131759644, "learning_rate": 2e-05, "loss": 0.6721, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10291, "tokens_per_second_per_gpu": 10013.76, "total_tokens": 1016079806 }, { "epoch": 0.6434108527131783, "grad_norm": 0.8874708414077759, "learning_rate": 2e-05, "loss": 0.6546, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10292, "tokens_per_second_per_gpu": 11234.32, "total_tokens": 1016182302 }, { "epoch": 0.6434733683420856, "grad_norm": 0.8962536454200745, "learning_rate": 2e-05, "loss": 0.6272, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10293, "tokens_per_second_per_gpu": 10613.91, "total_tokens": 1016281883 }, { "epoch": 0.6435358839709927, "grad_norm": 0.890434741973877, "learning_rate": 2e-05, "loss": 0.6165, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10294, "tokens_per_second_per_gpu": 9828.75, "total_tokens": 1016378315 }, { "epoch": 0.6435983995999, "grad_norm": 0.87840735912323, "learning_rate": 2e-05, "loss": 0.6392, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10295, "tokens_per_second_per_gpu": 10178.64, "total_tokens": 1016476975 }, { "epoch": 0.6436609152288072, "grad_norm": 0.8786551356315613, "learning_rate": 2e-05, "loss": 0.6747, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10296, "tokens_per_second_per_gpu": 10825.31, "total_tokens": 1016583103 }, { "epoch": 0.6437234308577144, "grad_norm": 0.8697987794876099, "learning_rate": 2e-05, "loss": 0.6707, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10297, "tokens_per_second_per_gpu": 11059.89, "total_tokens": 1016684019 }, { "epoch": 0.6437859464866217, "grad_norm": 0.908251941204071, "learning_rate": 2e-05, "loss": 0.6775, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10298, "tokens_per_second_per_gpu": 10523.26, "total_tokens": 1016782635 }, { "epoch": 0.6438484621155289, "grad_norm": 0.9182647466659546, "learning_rate": 2e-05, "loss": 0.6545, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10299, "tokens_per_second_per_gpu": 10189.65, "total_tokens": 1016877104 }, { "epoch": 0.6439109777444361, "grad_norm": 0.8999600410461426, "learning_rate": 2e-05, "loss": 0.6453, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10300, "tokens_per_second_per_gpu": 10792.53, "total_tokens": 1016981207 }, { "epoch": 0.6439734933733433, "grad_norm": 0.8575550317764282, "learning_rate": 2e-05, "loss": 0.5897, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10301, "tokens_per_second_per_gpu": 10636.06, "total_tokens": 1017076641 }, { "epoch": 0.6440360090022506, "grad_norm": 0.9494052529335022, "learning_rate": 2e-05, "loss": 0.6465, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10302, "tokens_per_second_per_gpu": 10186.58, "total_tokens": 1017177123 }, { "epoch": 0.6440985246311578, "grad_norm": 0.8860356211662292, "learning_rate": 2e-05, "loss": 0.6876, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10303, "tokens_per_second_per_gpu": 10761.82, "total_tokens": 1017278576 }, { "epoch": 0.644161040260065, "grad_norm": 0.8504418730735779, "learning_rate": 2e-05, "loss": 0.5873, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10304, "tokens_per_second_per_gpu": 10592.16, "total_tokens": 1017374459 }, { "epoch": 0.6442235558889723, "grad_norm": 0.8945097923278809, "learning_rate": 2e-05, "loss": 0.6307, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10305, "tokens_per_second_per_gpu": 10337.17, "total_tokens": 1017471646 }, { "epoch": 0.6442860715178794, "grad_norm": 0.8914260864257812, "learning_rate": 2e-05, "loss": 0.6055, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10306, "tokens_per_second_per_gpu": 10467.43, "total_tokens": 1017566848 }, { "epoch": 0.6443485871467867, "grad_norm": 0.8783039450645447, "learning_rate": 2e-05, "loss": 0.6287, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10307, "tokens_per_second_per_gpu": 10862.5, "total_tokens": 1017666114 }, { "epoch": 0.6444111027756939, "grad_norm": 0.8636065125465393, "learning_rate": 2e-05, "loss": 0.6086, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10308, "tokens_per_second_per_gpu": 10964.84, "total_tokens": 1017766972 }, { "epoch": 0.6444736184046012, "grad_norm": 0.8820681571960449, "learning_rate": 2e-05, "loss": 0.6453, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10309, "tokens_per_second_per_gpu": 11256.34, "total_tokens": 1017870231 }, { "epoch": 0.6445361340335084, "grad_norm": 0.8847072720527649, "learning_rate": 2e-05, "loss": 0.6452, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10310, "tokens_per_second_per_gpu": 11158.69, "total_tokens": 1017972732 }, { "epoch": 0.6445986496624156, "grad_norm": 0.8925856351852417, "learning_rate": 2e-05, "loss": 0.6634, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10311, "tokens_per_second_per_gpu": 10704.48, "total_tokens": 1018069807 }, { "epoch": 0.6446611652913228, "grad_norm": 0.8890071511268616, "learning_rate": 2e-05, "loss": 0.625, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10312, "tokens_per_second_per_gpu": 9833.17, "total_tokens": 1018168176 }, { "epoch": 0.64472368092023, "grad_norm": 0.9126707315444946, "learning_rate": 2e-05, "loss": 0.6117, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10313, "tokens_per_second_per_gpu": 11161.19, "total_tokens": 1018265625 }, { "epoch": 0.6447861965491373, "grad_norm": 0.8787087202072144, "learning_rate": 2e-05, "loss": 0.6447, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10314, "tokens_per_second_per_gpu": 10066.45, "total_tokens": 1018365235 }, { "epoch": 0.6448487121780445, "grad_norm": 0.8865863084793091, "learning_rate": 2e-05, "loss": 0.6581, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10315, "tokens_per_second_per_gpu": 10766.68, "total_tokens": 1018465973 }, { "epoch": 0.6449112278069518, "grad_norm": 0.8745901584625244, "learning_rate": 2e-05, "loss": 0.6714, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10316, "tokens_per_second_per_gpu": 10702.05, "total_tokens": 1018565380 }, { "epoch": 0.644973743435859, "grad_norm": 0.9071268439292908, "learning_rate": 2e-05, "loss": 0.591, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10317, "tokens_per_second_per_gpu": 9833.78, "total_tokens": 1018660478 }, { "epoch": 0.6450362590647661, "grad_norm": 0.8934338688850403, "learning_rate": 2e-05, "loss": 0.6206, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10318, "tokens_per_second_per_gpu": 10219.64, "total_tokens": 1018753589 }, { "epoch": 0.6450987746936734, "grad_norm": 0.9030133485794067, "learning_rate": 2e-05, "loss": 0.6557, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10319, "tokens_per_second_per_gpu": 10365.18, "total_tokens": 1018850979 }, { "epoch": 0.6451612903225806, "grad_norm": 0.8826854825019836, "learning_rate": 2e-05, "loss": 0.6069, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10320, "tokens_per_second_per_gpu": 9687.32, "total_tokens": 1018945117 }, { "epoch": 0.6452238059514879, "grad_norm": 0.9410048723220825, "learning_rate": 2e-05, "loss": 0.622, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10321, "tokens_per_second_per_gpu": 9672.39, "total_tokens": 1019042336 }, { "epoch": 0.6452863215803951, "grad_norm": 0.8933626413345337, "learning_rate": 2e-05, "loss": 0.6345, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10322, "tokens_per_second_per_gpu": 10071.26, "total_tokens": 1019140186 }, { "epoch": 0.6453488372093024, "grad_norm": 0.8882331252098083, "learning_rate": 2e-05, "loss": 0.5967, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10323, "tokens_per_second_per_gpu": 10527.36, "total_tokens": 1019236624 }, { "epoch": 0.6454113528382096, "grad_norm": 0.9058912992477417, "learning_rate": 2e-05, "loss": 0.5783, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10324, "tokens_per_second_per_gpu": 8912.84, "total_tokens": 1019325986 }, { "epoch": 0.6454738684671167, "grad_norm": 0.9015977382659912, "learning_rate": 2e-05, "loss": 0.5984, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10325, "tokens_per_second_per_gpu": 9052.72, "total_tokens": 1019415048 }, { "epoch": 0.645536384096024, "grad_norm": 1.0142170190811157, "learning_rate": 2e-05, "loss": 0.6378, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10326, "tokens_per_second_per_gpu": 10222.32, "total_tokens": 1019509360 }, { "epoch": 0.6455988997249312, "grad_norm": 0.9102093577384949, "learning_rate": 2e-05, "loss": 0.6549, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10327, "tokens_per_second_per_gpu": 10768.66, "total_tokens": 1019606801 }, { "epoch": 0.6456614153538385, "grad_norm": 0.85361647605896, "learning_rate": 2e-05, "loss": 0.6228, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10328, "tokens_per_second_per_gpu": 10907.02, "total_tokens": 1019708791 }, { "epoch": 0.6457239309827457, "grad_norm": 0.9034430980682373, "learning_rate": 2e-05, "loss": 0.619, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10329, "tokens_per_second_per_gpu": 10285.81, "total_tokens": 1019806162 }, { "epoch": 0.645786446611653, "grad_norm": 0.9973139762878418, "learning_rate": 2e-05, "loss": 0.6306, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10330, "tokens_per_second_per_gpu": 10294.43, "total_tokens": 1019897712 }, { "epoch": 0.6458489622405601, "grad_norm": 0.880916953086853, "learning_rate": 2e-05, "loss": 0.6105, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10331, "tokens_per_second_per_gpu": 9918.54, "total_tokens": 1019997691 }, { "epoch": 0.6459114778694673, "grad_norm": 0.8883991241455078, "learning_rate": 2e-05, "loss": 0.6386, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10332, "tokens_per_second_per_gpu": 10000.58, "total_tokens": 1020095656 }, { "epoch": 0.6459739934983746, "grad_norm": 0.863332211971283, "learning_rate": 2e-05, "loss": 0.6017, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10333, "tokens_per_second_per_gpu": 10714.96, "total_tokens": 1020191697 }, { "epoch": 0.6460365091272818, "grad_norm": 0.9305469989776611, "learning_rate": 2e-05, "loss": 0.6828, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10334, "tokens_per_second_per_gpu": 10484.11, "total_tokens": 1020288850 }, { "epoch": 0.6460990247561891, "grad_norm": 0.8975393772125244, "learning_rate": 2e-05, "loss": 0.6111, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10335, "tokens_per_second_per_gpu": 10642.62, "total_tokens": 1020386746 }, { "epoch": 0.6461615403850963, "grad_norm": 0.9096490144729614, "learning_rate": 2e-05, "loss": 0.6376, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10336, "tokens_per_second_per_gpu": 10293.39, "total_tokens": 1020481234 }, { "epoch": 0.6462240560140035, "grad_norm": 0.8739703297615051, "learning_rate": 2e-05, "loss": 0.6232, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10337, "tokens_per_second_per_gpu": 10228.59, "total_tokens": 1020581061 }, { "epoch": 0.6462865716429107, "grad_norm": 0.8771292567253113, "learning_rate": 2e-05, "loss": 0.5875, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10338, "tokens_per_second_per_gpu": 9183.93, "total_tokens": 1020675947 }, { "epoch": 0.646349087271818, "grad_norm": 0.9182335138320923, "learning_rate": 2e-05, "loss": 0.6523, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10339, "tokens_per_second_per_gpu": 11138.62, "total_tokens": 1020775557 }, { "epoch": 0.6464116029007252, "grad_norm": 0.9061798453330994, "learning_rate": 2e-05, "loss": 0.5939, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10340, "tokens_per_second_per_gpu": 9324.18, "total_tokens": 1020870651 }, { "epoch": 0.6464741185296324, "grad_norm": 0.876841127872467, "learning_rate": 2e-05, "loss": 0.6299, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10341, "tokens_per_second_per_gpu": 10440.43, "total_tokens": 1020966402 }, { "epoch": 0.6465366341585397, "grad_norm": 0.8871237635612488, "learning_rate": 2e-05, "loss": 0.6164, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10342, "tokens_per_second_per_gpu": 10004.32, "total_tokens": 1021060793 }, { "epoch": 0.6465991497874468, "grad_norm": 0.9299411177635193, "learning_rate": 2e-05, "loss": 0.6536, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10343, "tokens_per_second_per_gpu": 10724.56, "total_tokens": 1021160116 }, { "epoch": 0.6466616654163541, "grad_norm": 0.9009923934936523, "learning_rate": 2e-05, "loss": 0.6284, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10344, "tokens_per_second_per_gpu": 10740.85, "total_tokens": 1021254200 }, { "epoch": 0.6467241810452613, "grad_norm": 0.9473881721496582, "learning_rate": 2e-05, "loss": 0.6608, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10345, "tokens_per_second_per_gpu": 10719.82, "total_tokens": 1021353633 }, { "epoch": 0.6467866966741685, "grad_norm": 0.9278183579444885, "learning_rate": 2e-05, "loss": 0.645, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10346, "tokens_per_second_per_gpu": 10633.43, "total_tokens": 1021453607 }, { "epoch": 0.6468492123030758, "grad_norm": 0.9080653190612793, "learning_rate": 2e-05, "loss": 0.6173, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10347, "tokens_per_second_per_gpu": 10907.17, "total_tokens": 1021549620 }, { "epoch": 0.646911727931983, "grad_norm": 0.8938696384429932, "learning_rate": 2e-05, "loss": 0.643, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10348, "tokens_per_second_per_gpu": 10135.67, "total_tokens": 1021650446 }, { "epoch": 0.6469742435608902, "grad_norm": 0.9013009667396545, "learning_rate": 2e-05, "loss": 0.6152, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10349, "tokens_per_second_per_gpu": 9646.22, "total_tokens": 1021740641 }, { "epoch": 0.6470367591897974, "grad_norm": 0.8821783065795898, "learning_rate": 2e-05, "loss": 0.6417, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10350, "tokens_per_second_per_gpu": 10464.44, "total_tokens": 1021840111 }, { "epoch": 0.6470992748187047, "grad_norm": 0.888098418712616, "learning_rate": 2e-05, "loss": 0.6423, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10351, "tokens_per_second_per_gpu": 10108.36, "total_tokens": 1021940285 }, { "epoch": 0.6471617904476119, "grad_norm": 0.9327171444892883, "learning_rate": 2e-05, "loss": 0.607, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10352, "tokens_per_second_per_gpu": 9774.24, "total_tokens": 1022034756 }, { "epoch": 0.6472243060765192, "grad_norm": 0.8785784244537354, "learning_rate": 2e-05, "loss": 0.6638, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10353, "tokens_per_second_per_gpu": 10624.01, "total_tokens": 1022136288 }, { "epoch": 0.6472868217054264, "grad_norm": 0.9119632840156555, "learning_rate": 2e-05, "loss": 0.6391, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10354, "tokens_per_second_per_gpu": 11377.27, "total_tokens": 1022238207 }, { "epoch": 0.6473493373343335, "grad_norm": 0.8806191086769104, "learning_rate": 2e-05, "loss": 0.61, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10355, "tokens_per_second_per_gpu": 10174.14, "total_tokens": 1022337490 }, { "epoch": 0.6474118529632408, "grad_norm": 0.9368528723716736, "learning_rate": 2e-05, "loss": 0.655, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10356, "tokens_per_second_per_gpu": 10331.5, "total_tokens": 1022435830 }, { "epoch": 0.647474368592148, "grad_norm": 0.9582546949386597, "learning_rate": 2e-05, "loss": 0.6693, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10357, "tokens_per_second_per_gpu": 10807.08, "total_tokens": 1022533800 }, { "epoch": 0.6475368842210553, "grad_norm": 0.8837193250656128, "learning_rate": 2e-05, "loss": 0.6061, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10358, "tokens_per_second_per_gpu": 9240.13, "total_tokens": 1022627309 }, { "epoch": 0.6475993998499625, "grad_norm": 0.9165983200073242, "learning_rate": 2e-05, "loss": 0.6732, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10359, "tokens_per_second_per_gpu": 10364.59, "total_tokens": 1022725696 }, { "epoch": 0.6476619154788698, "grad_norm": 0.8766468167304993, "learning_rate": 2e-05, "loss": 0.6263, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10360, "tokens_per_second_per_gpu": 10559.73, "total_tokens": 1022821707 }, { "epoch": 0.647724431107777, "grad_norm": 0.9426659941673279, "learning_rate": 2e-05, "loss": 0.6268, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10361, "tokens_per_second_per_gpu": 10421.79, "total_tokens": 1022922768 }, { "epoch": 0.6477869467366841, "grad_norm": 0.9117684960365295, "learning_rate": 2e-05, "loss": 0.6341, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10362, "tokens_per_second_per_gpu": 10852.5, "total_tokens": 1023019230 }, { "epoch": 0.6478494623655914, "grad_norm": 0.8885912299156189, "learning_rate": 2e-05, "loss": 0.5914, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10363, "tokens_per_second_per_gpu": 10217.33, "total_tokens": 1023109998 }, { "epoch": 0.6479119779944986, "grad_norm": 0.9044308662414551, "learning_rate": 2e-05, "loss": 0.6265, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10364, "tokens_per_second_per_gpu": 10189.8, "total_tokens": 1023205188 }, { "epoch": 0.6479744936234059, "grad_norm": 0.8853316903114319, "learning_rate": 2e-05, "loss": 0.6429, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10365, "tokens_per_second_per_gpu": 9684.54, "total_tokens": 1023300901 }, { "epoch": 0.6480370092523131, "grad_norm": 0.9271658658981323, "learning_rate": 2e-05, "loss": 0.6153, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10366, "tokens_per_second_per_gpu": 9931.06, "total_tokens": 1023394333 }, { "epoch": 0.6480995248812204, "grad_norm": 0.9193664789199829, "learning_rate": 2e-05, "loss": 0.6648, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10367, "tokens_per_second_per_gpu": 10933.54, "total_tokens": 1023491446 }, { "epoch": 0.6481620405101275, "grad_norm": 0.8611575961112976, "learning_rate": 2e-05, "loss": 0.6316, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10368, "tokens_per_second_per_gpu": 10261.71, "total_tokens": 1023593359 }, { "epoch": 0.6482245561390347, "grad_norm": 0.8881500363349915, "learning_rate": 2e-05, "loss": 0.6077, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10369, "tokens_per_second_per_gpu": 9796.64, "total_tokens": 1023690353 }, { "epoch": 0.648287071767942, "grad_norm": 0.9295154213905334, "learning_rate": 2e-05, "loss": 0.7184, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10370, "tokens_per_second_per_gpu": 10331.34, "total_tokens": 1023788461 }, { "epoch": 0.6483495873968492, "grad_norm": 0.8972650170326233, "learning_rate": 2e-05, "loss": 0.607, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10371, "tokens_per_second_per_gpu": 10810.34, "total_tokens": 1023883298 }, { "epoch": 0.6484121030257565, "grad_norm": 0.8923614025115967, "learning_rate": 2e-05, "loss": 0.6077, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10372, "tokens_per_second_per_gpu": 11115.9, "total_tokens": 1023981739 }, { "epoch": 0.6484746186546637, "grad_norm": 0.881975531578064, "learning_rate": 2e-05, "loss": 0.5968, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10373, "tokens_per_second_per_gpu": 10080.59, "total_tokens": 1024075545 }, { "epoch": 0.6485371342835708, "grad_norm": 0.9211488366127014, "learning_rate": 2e-05, "loss": 0.63, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10374, "tokens_per_second_per_gpu": 10940.44, "total_tokens": 1024172485 }, { "epoch": 0.6485996499124781, "grad_norm": 0.909895122051239, "learning_rate": 2e-05, "loss": 0.6275, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10375, "tokens_per_second_per_gpu": 10717.84, "total_tokens": 1024270799 }, { "epoch": 0.6486621655413853, "grad_norm": 0.9438392519950867, "learning_rate": 2e-05, "loss": 0.6647, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10376, "tokens_per_second_per_gpu": 9906.72, "total_tokens": 1024364308 }, { "epoch": 0.6487246811702926, "grad_norm": 0.9274046421051025, "learning_rate": 2e-05, "loss": 0.6603, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10377, "tokens_per_second_per_gpu": 9709.62, "total_tokens": 1024460983 }, { "epoch": 0.6487871967991998, "grad_norm": 0.9073293209075928, "learning_rate": 2e-05, "loss": 0.6445, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10378, "tokens_per_second_per_gpu": 10099.35, "total_tokens": 1024557101 }, { "epoch": 0.6488497124281071, "grad_norm": 0.8806548118591309, "learning_rate": 2e-05, "loss": 0.6355, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10379, "tokens_per_second_per_gpu": 10585.78, "total_tokens": 1024654408 }, { "epoch": 0.6489122280570142, "grad_norm": 0.8716807961463928, "learning_rate": 2e-05, "loss": 0.6284, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10380, "tokens_per_second_per_gpu": 10320.39, "total_tokens": 1024753287 }, { "epoch": 0.6489747436859215, "grad_norm": 0.8808560371398926, "learning_rate": 2e-05, "loss": 0.6354, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10381, "tokens_per_second_per_gpu": 10724.25, "total_tokens": 1024851869 }, { "epoch": 0.6490372593148287, "grad_norm": 0.8796229362487793, "learning_rate": 2e-05, "loss": 0.6374, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10382, "tokens_per_second_per_gpu": 10594.48, "total_tokens": 1024951678 }, { "epoch": 0.6490997749437359, "grad_norm": 0.9195794463157654, "learning_rate": 2e-05, "loss": 0.6309, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10383, "tokens_per_second_per_gpu": 10017.72, "total_tokens": 1025044899 }, { "epoch": 0.6491622905726432, "grad_norm": 0.896199643611908, "learning_rate": 2e-05, "loss": 0.6409, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10384, "tokens_per_second_per_gpu": 10434.49, "total_tokens": 1025144354 }, { "epoch": 0.6492248062015504, "grad_norm": 0.9407181739807129, "learning_rate": 2e-05, "loss": 0.661, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10385, "tokens_per_second_per_gpu": 10564.08, "total_tokens": 1025239712 }, { "epoch": 0.6492873218304576, "grad_norm": 0.9012520909309387, "learning_rate": 2e-05, "loss": 0.643, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10386, "tokens_per_second_per_gpu": 10917.42, "total_tokens": 1025339025 }, { "epoch": 0.6493498374593648, "grad_norm": 0.9332303404808044, "learning_rate": 2e-05, "loss": 0.5816, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10387, "tokens_per_second_per_gpu": 9228.14, "total_tokens": 1025424945 }, { "epoch": 0.649412353088272, "grad_norm": 0.9280288815498352, "learning_rate": 2e-05, "loss": 0.6462, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10388, "tokens_per_second_per_gpu": 10222.25, "total_tokens": 1025520122 }, { "epoch": 0.6494748687171793, "grad_norm": 0.89773029088974, "learning_rate": 2e-05, "loss": 0.665, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10389, "tokens_per_second_per_gpu": 9999.19, "total_tokens": 1025619196 }, { "epoch": 0.6495373843460865, "grad_norm": 0.9229170680046082, "learning_rate": 2e-05, "loss": 0.626, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10390, "tokens_per_second_per_gpu": 10444.78, "total_tokens": 1025716605 }, { "epoch": 0.6495998999749938, "grad_norm": 0.8848072290420532, "learning_rate": 2e-05, "loss": 0.5889, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10391, "tokens_per_second_per_gpu": 10180.77, "total_tokens": 1025811717 }, { "epoch": 0.6496624156039009, "grad_norm": 0.9134213924407959, "learning_rate": 2e-05, "loss": 0.608, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10392, "tokens_per_second_per_gpu": 10035.62, "total_tokens": 1025906626 }, { "epoch": 0.6497249312328082, "grad_norm": 0.8701199889183044, "learning_rate": 2e-05, "loss": 0.573, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10393, "tokens_per_second_per_gpu": 10962.34, "total_tokens": 1026004714 }, { "epoch": 0.6497874468617154, "grad_norm": 0.8842881917953491, "learning_rate": 2e-05, "loss": 0.6375, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10394, "tokens_per_second_per_gpu": 10476.0, "total_tokens": 1026104158 }, { "epoch": 0.6498499624906227, "grad_norm": 0.9496554732322693, "learning_rate": 2e-05, "loss": 0.5987, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10395, "tokens_per_second_per_gpu": 10304.5, "total_tokens": 1026199947 }, { "epoch": 0.6499124781195299, "grad_norm": 0.937710702419281, "learning_rate": 2e-05, "loss": 0.6182, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10396, "tokens_per_second_per_gpu": 9734.29, "total_tokens": 1026294336 }, { "epoch": 0.6499749937484371, "grad_norm": 0.9106544256210327, "learning_rate": 2e-05, "loss": 0.6151, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10397, "tokens_per_second_per_gpu": 10329.17, "total_tokens": 1026390201 }, { "epoch": 0.6500375093773444, "grad_norm": 0.8465027809143066, "learning_rate": 2e-05, "loss": 0.5908, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10398, "tokens_per_second_per_gpu": 10889.28, "total_tokens": 1026488807 }, { "epoch": 0.6501000250062515, "grad_norm": 0.8546819090843201, "learning_rate": 2e-05, "loss": 0.6279, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10399, "tokens_per_second_per_gpu": 10661.98, "total_tokens": 1026588820 }, { "epoch": 0.6501625406351588, "grad_norm": 0.9328698515892029, "learning_rate": 2e-05, "loss": 0.6334, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10400, "tokens_per_second_per_gpu": 9759.68, "total_tokens": 1026684526 }, { "epoch": 0.650225056264066, "grad_norm": 0.8892040848731995, "learning_rate": 2e-05, "loss": 0.6238, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10401, "tokens_per_second_per_gpu": 9836.74, "total_tokens": 1026783172 }, { "epoch": 0.6502875718929733, "grad_norm": 0.8870048522949219, "learning_rate": 2e-05, "loss": 0.6694, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10402, "tokens_per_second_per_gpu": 11188.35, "total_tokens": 1026887274 }, { "epoch": 0.6503500875218805, "grad_norm": 0.9275473952293396, "learning_rate": 2e-05, "loss": 0.6545, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10403, "tokens_per_second_per_gpu": 11106.6, "total_tokens": 1026987198 }, { "epoch": 0.6504126031507877, "grad_norm": 0.8752787113189697, "learning_rate": 2e-05, "loss": 0.6367, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10404, "tokens_per_second_per_gpu": 10957.65, "total_tokens": 1027089543 }, { "epoch": 0.6504751187796949, "grad_norm": 0.8439319729804993, "learning_rate": 2e-05, "loss": 0.6067, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10405, "tokens_per_second_per_gpu": 10241.08, "total_tokens": 1027189426 }, { "epoch": 0.6505376344086021, "grad_norm": 0.875276505947113, "learning_rate": 2e-05, "loss": 0.6001, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10406, "tokens_per_second_per_gpu": 10518.62, "total_tokens": 1027290987 }, { "epoch": 0.6506001500375094, "grad_norm": 0.89532870054245, "learning_rate": 2e-05, "loss": 0.6086, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10407, "tokens_per_second_per_gpu": 10619.77, "total_tokens": 1027389549 }, { "epoch": 0.6506626656664166, "grad_norm": 0.8910169005393982, "learning_rate": 2e-05, "loss": 0.6024, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10408, "tokens_per_second_per_gpu": 10387.72, "total_tokens": 1027487268 }, { "epoch": 0.6507251812953239, "grad_norm": 0.9473839402198792, "learning_rate": 2e-05, "loss": 0.6081, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10409, "tokens_per_second_per_gpu": 10582.05, "total_tokens": 1027583690 }, { "epoch": 0.6507876969242311, "grad_norm": 0.8926498889923096, "learning_rate": 2e-05, "loss": 0.6064, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10410, "tokens_per_second_per_gpu": 10953.9, "total_tokens": 1027681893 }, { "epoch": 0.6508502125531382, "grad_norm": 0.8673636317253113, "learning_rate": 2e-05, "loss": 0.5812, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10411, "tokens_per_second_per_gpu": 10221.61, "total_tokens": 1027782894 }, { "epoch": 0.6509127281820455, "grad_norm": 0.9181534647941589, "learning_rate": 2e-05, "loss": 0.6332, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10412, "tokens_per_second_per_gpu": 9575.96, "total_tokens": 1027877006 }, { "epoch": 0.6509752438109527, "grad_norm": 0.8775072693824768, "learning_rate": 2e-05, "loss": 0.6385, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10413, "tokens_per_second_per_gpu": 9700.66, "total_tokens": 1027973762 }, { "epoch": 0.65103775943986, "grad_norm": 0.8951517939567566, "learning_rate": 2e-05, "loss": 0.6343, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10414, "tokens_per_second_per_gpu": 10299.46, "total_tokens": 1028068052 }, { "epoch": 0.6511002750687672, "grad_norm": 0.8603281378746033, "learning_rate": 2e-05, "loss": 0.6267, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10415, "tokens_per_second_per_gpu": 10078.31, "total_tokens": 1028166839 }, { "epoch": 0.6511627906976745, "grad_norm": 0.9028363823890686, "learning_rate": 2e-05, "loss": 0.6163, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10416, "tokens_per_second_per_gpu": 11094.93, "total_tokens": 1028268322 }, { "epoch": 0.6512253063265816, "grad_norm": 0.9011664986610413, "learning_rate": 2e-05, "loss": 0.6611, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10417, "tokens_per_second_per_gpu": 10821.82, "total_tokens": 1028367349 }, { "epoch": 0.6512878219554888, "grad_norm": 0.9102919697761536, "learning_rate": 2e-05, "loss": 0.6139, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10418, "tokens_per_second_per_gpu": 10809.62, "total_tokens": 1028467317 }, { "epoch": 0.6513503375843961, "grad_norm": 0.93421870470047, "learning_rate": 2e-05, "loss": 0.6099, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10419, "tokens_per_second_per_gpu": 9892.01, "total_tokens": 1028557961 }, { "epoch": 0.6514128532133033, "grad_norm": 0.856378436088562, "learning_rate": 2e-05, "loss": 0.6446, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10420, "tokens_per_second_per_gpu": 11281.5, "total_tokens": 1028663291 }, { "epoch": 0.6514753688422106, "grad_norm": 0.8461247682571411, "learning_rate": 2e-05, "loss": 0.5884, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10421, "tokens_per_second_per_gpu": 11023.15, "total_tokens": 1028763794 }, { "epoch": 0.6515378844711178, "grad_norm": 0.9344257116317749, "learning_rate": 2e-05, "loss": 0.6595, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10422, "tokens_per_second_per_gpu": 10682.45, "total_tokens": 1028864700 }, { "epoch": 0.651600400100025, "grad_norm": 0.8937732577323914, "learning_rate": 2e-05, "loss": 0.6847, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10423, "tokens_per_second_per_gpu": 11200.25, "total_tokens": 1028966642 }, { "epoch": 0.6516629157289322, "grad_norm": 0.9127314686775208, "learning_rate": 2e-05, "loss": 0.6231, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10424, "tokens_per_second_per_gpu": 9402.65, "total_tokens": 1029062422 }, { "epoch": 0.6517254313578394, "grad_norm": 0.8801157474517822, "learning_rate": 2e-05, "loss": 0.6046, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10425, "tokens_per_second_per_gpu": 9341.04, "total_tokens": 1029158675 }, { "epoch": 0.6517879469867467, "grad_norm": 0.9089124798774719, "learning_rate": 2e-05, "loss": 0.6512, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10426, "tokens_per_second_per_gpu": 10832.06, "total_tokens": 1029259474 }, { "epoch": 0.6518504626156539, "grad_norm": 0.871502161026001, "learning_rate": 2e-05, "loss": 0.6173, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10427, "tokens_per_second_per_gpu": 11047.9, "total_tokens": 1029362601 }, { "epoch": 0.6519129782445612, "grad_norm": 0.8841450214385986, "learning_rate": 2e-05, "loss": 0.6498, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10428, "tokens_per_second_per_gpu": 10670.79, "total_tokens": 1029461136 }, { "epoch": 0.6519754938734683, "grad_norm": 0.9113218784332275, "learning_rate": 2e-05, "loss": 0.6313, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10429, "tokens_per_second_per_gpu": 10328.24, "total_tokens": 1029557841 }, { "epoch": 0.6520380095023756, "grad_norm": 0.9097399711608887, "learning_rate": 2e-05, "loss": 0.6469, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10430, "tokens_per_second_per_gpu": 10618.39, "total_tokens": 1029656469 }, { "epoch": 0.6521005251312828, "grad_norm": 0.8768936991691589, "learning_rate": 2e-05, "loss": 0.6174, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10431, "tokens_per_second_per_gpu": 10658.72, "total_tokens": 1029754784 }, { "epoch": 0.65216304076019, "grad_norm": 0.8931080102920532, "learning_rate": 2e-05, "loss": 0.6464, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10432, "tokens_per_second_per_gpu": 11008.12, "total_tokens": 1029855549 }, { "epoch": 0.6522255563890973, "grad_norm": 0.8969259858131409, "learning_rate": 2e-05, "loss": 0.6411, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10433, "tokens_per_second_per_gpu": 10187.28, "total_tokens": 1029956010 }, { "epoch": 0.6522880720180045, "grad_norm": 0.8877400159835815, "learning_rate": 2e-05, "loss": 0.6016, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10434, "tokens_per_second_per_gpu": 11195.58, "total_tokens": 1030058233 }, { "epoch": 0.6523505876469117, "grad_norm": 0.8859819769859314, "learning_rate": 2e-05, "loss": 0.6154, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10435, "tokens_per_second_per_gpu": 10121.28, "total_tokens": 1030156834 }, { "epoch": 0.6524131032758189, "grad_norm": 0.887622058391571, "learning_rate": 2e-05, "loss": 0.625, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10436, "tokens_per_second_per_gpu": 10666.64, "total_tokens": 1030257461 }, { "epoch": 0.6524756189047262, "grad_norm": 0.8913078308105469, "learning_rate": 2e-05, "loss": 0.6171, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10437, "tokens_per_second_per_gpu": 10601.15, "total_tokens": 1030356301 }, { "epoch": 0.6525381345336334, "grad_norm": 0.9205443859100342, "learning_rate": 2e-05, "loss": 0.6455, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10438, "tokens_per_second_per_gpu": 10631.72, "total_tokens": 1030455310 }, { "epoch": 0.6526006501625407, "grad_norm": 0.8743315935134888, "learning_rate": 2e-05, "loss": 0.6666, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10439, "tokens_per_second_per_gpu": 11296.84, "total_tokens": 1030557780 }, { "epoch": 0.6526631657914479, "grad_norm": 0.9960190057754517, "learning_rate": 2e-05, "loss": 0.6049, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10440, "tokens_per_second_per_gpu": 10907.77, "total_tokens": 1030656118 }, { "epoch": 0.6527256814203551, "grad_norm": 0.8937311768531799, "learning_rate": 2e-05, "loss": 0.5818, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10441, "tokens_per_second_per_gpu": 9813.85, "total_tokens": 1030748446 }, { "epoch": 0.6527881970492623, "grad_norm": 0.8486611247062683, "learning_rate": 2e-05, "loss": 0.6087, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10442, "tokens_per_second_per_gpu": 10370.56, "total_tokens": 1030849160 }, { "epoch": 0.6528507126781695, "grad_norm": 0.9079113602638245, "learning_rate": 2e-05, "loss": 0.6243, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10443, "tokens_per_second_per_gpu": 10291.62, "total_tokens": 1030948658 }, { "epoch": 0.6529132283070768, "grad_norm": 0.868583083152771, "learning_rate": 2e-05, "loss": 0.632, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10444, "tokens_per_second_per_gpu": 10642.91, "total_tokens": 1031050585 }, { "epoch": 0.652975743935984, "grad_norm": 0.8896238803863525, "learning_rate": 2e-05, "loss": 0.6588, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10445, "tokens_per_second_per_gpu": 10897.15, "total_tokens": 1031151217 }, { "epoch": 0.6530382595648913, "grad_norm": 0.8692699074745178, "learning_rate": 2e-05, "loss": 0.6173, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10446, "tokens_per_second_per_gpu": 10157.14, "total_tokens": 1031248684 }, { "epoch": 0.6531007751937985, "grad_norm": 0.857332706451416, "learning_rate": 2e-05, "loss": 0.6008, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10447, "tokens_per_second_per_gpu": 11208.58, "total_tokens": 1031348161 }, { "epoch": 0.6531632908227056, "grad_norm": 0.8841230273246765, "learning_rate": 2e-05, "loss": 0.5997, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10448, "tokens_per_second_per_gpu": 10294.63, "total_tokens": 1031444880 }, { "epoch": 0.6532258064516129, "grad_norm": 0.8782588839530945, "learning_rate": 2e-05, "loss": 0.6165, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10449, "tokens_per_second_per_gpu": 10614.11, "total_tokens": 1031544053 }, { "epoch": 0.6532883220805201, "grad_norm": 0.9227426648139954, "learning_rate": 2e-05, "loss": 0.6624, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10450, "tokens_per_second_per_gpu": 10985.58, "total_tokens": 1031645829 }, { "epoch": 0.6533508377094274, "grad_norm": 0.8694010972976685, "learning_rate": 2e-05, "loss": 0.6153, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10451, "tokens_per_second_per_gpu": 11009.98, "total_tokens": 1031746854 }, { "epoch": 0.6534133533383346, "grad_norm": 0.8974953293800354, "learning_rate": 2e-05, "loss": 0.6288, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10452, "tokens_per_second_per_gpu": 10409.85, "total_tokens": 1031844990 }, { "epoch": 0.6534758689672419, "grad_norm": 0.9352082014083862, "learning_rate": 2e-05, "loss": 0.6347, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10453, "tokens_per_second_per_gpu": 10069.83, "total_tokens": 1031941786 }, { "epoch": 0.653538384596149, "grad_norm": 0.8868229389190674, "learning_rate": 2e-05, "loss": 0.6322, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10454, "tokens_per_second_per_gpu": 10997.68, "total_tokens": 1032041105 }, { "epoch": 0.6536009002250562, "grad_norm": 0.9021306037902832, "learning_rate": 2e-05, "loss": 0.6375, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10455, "tokens_per_second_per_gpu": 11196.25, "total_tokens": 1032140477 }, { "epoch": 0.6536634158539635, "grad_norm": 0.9125391840934753, "learning_rate": 2e-05, "loss": 0.6176, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10456, "tokens_per_second_per_gpu": 10341.83, "total_tokens": 1032238314 }, { "epoch": 0.6537259314828707, "grad_norm": 0.8769202828407288, "learning_rate": 2e-05, "loss": 0.6411, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10457, "tokens_per_second_per_gpu": 14349.74, "total_tokens": 1032340750 }, { "epoch": 0.653788447111778, "grad_norm": 0.9048409461975098, "learning_rate": 2e-05, "loss": 0.6296, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10458, "tokens_per_second_per_gpu": 10523.61, "total_tokens": 1032440745 }, { "epoch": 0.6538509627406852, "grad_norm": 0.8843181133270264, "learning_rate": 2e-05, "loss": 0.6349, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10459, "tokens_per_second_per_gpu": 10755.94, "total_tokens": 1032543027 }, { "epoch": 0.6539134783695923, "grad_norm": 0.8835592865943909, "learning_rate": 2e-05, "loss": 0.6396, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10460, "tokens_per_second_per_gpu": 10666.86, "total_tokens": 1032645412 }, { "epoch": 0.6539759939984996, "grad_norm": 0.8817274570465088, "learning_rate": 2e-05, "loss": 0.6347, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10461, "tokens_per_second_per_gpu": 11123.54, "total_tokens": 1032744670 }, { "epoch": 0.6540385096274068, "grad_norm": 0.8918861746788025, "learning_rate": 2e-05, "loss": 0.6266, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10462, "tokens_per_second_per_gpu": 10499.39, "total_tokens": 1032839833 }, { "epoch": 0.6541010252563141, "grad_norm": 0.8862540125846863, "learning_rate": 2e-05, "loss": 0.6408, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10463, "tokens_per_second_per_gpu": 10540.93, "total_tokens": 1032936987 }, { "epoch": 0.6541635408852213, "grad_norm": 0.9222052693367004, "learning_rate": 2e-05, "loss": 0.6181, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10464, "tokens_per_second_per_gpu": 10020.0, "total_tokens": 1033029717 }, { "epoch": 0.6542260565141286, "grad_norm": 0.9379024505615234, "learning_rate": 2e-05, "loss": 0.6397, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10465, "tokens_per_second_per_gpu": 10758.36, "total_tokens": 1033129358 }, { "epoch": 0.6542885721430357, "grad_norm": 0.9199205040931702, "learning_rate": 2e-05, "loss": 0.6504, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10466, "tokens_per_second_per_gpu": 10754.11, "total_tokens": 1033231735 }, { "epoch": 0.654351087771943, "grad_norm": 0.8714631199836731, "learning_rate": 2e-05, "loss": 0.6492, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10467, "tokens_per_second_per_gpu": 10255.44, "total_tokens": 1033332451 }, { "epoch": 0.6544136034008502, "grad_norm": 0.9176265597343445, "learning_rate": 2e-05, "loss": 0.6693, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10468, "tokens_per_second_per_gpu": 11134.96, "total_tokens": 1033431201 }, { "epoch": 0.6544761190297574, "grad_norm": 0.8853354454040527, "learning_rate": 2e-05, "loss": 0.59, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10469, "tokens_per_second_per_gpu": 10810.75, "total_tokens": 1033531172 }, { "epoch": 0.6545386346586647, "grad_norm": 0.9395289421081543, "learning_rate": 2e-05, "loss": 0.6313, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10470, "tokens_per_second_per_gpu": 9568.25, "total_tokens": 1033625412 }, { "epoch": 0.6546011502875719, "grad_norm": 0.9165586829185486, "learning_rate": 2e-05, "loss": 0.6749, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10471, "tokens_per_second_per_gpu": 10498.45, "total_tokens": 1033727071 }, { "epoch": 0.6546636659164791, "grad_norm": 0.8987449407577515, "learning_rate": 2e-05, "loss": 0.634, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10472, "tokens_per_second_per_gpu": 10458.61, "total_tokens": 1033824641 }, { "epoch": 0.6547261815453863, "grad_norm": 0.9012412428855896, "learning_rate": 2e-05, "loss": 0.6315, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10473, "tokens_per_second_per_gpu": 9870.46, "total_tokens": 1033921188 }, { "epoch": 0.6547886971742936, "grad_norm": 0.8505908250808716, "learning_rate": 2e-05, "loss": 0.6225, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10474, "tokens_per_second_per_gpu": 11075.68, "total_tokens": 1034024456 }, { "epoch": 0.6548512128032008, "grad_norm": 0.9101012349128723, "learning_rate": 2e-05, "loss": 0.6487, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10475, "tokens_per_second_per_gpu": 11494.87, "total_tokens": 1034125243 }, { "epoch": 0.654913728432108, "grad_norm": 0.8871002197265625, "learning_rate": 2e-05, "loss": 0.6386, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10476, "tokens_per_second_per_gpu": 10322.84, "total_tokens": 1034223659 }, { "epoch": 0.6549762440610153, "grad_norm": 0.8784370422363281, "learning_rate": 2e-05, "loss": 0.6122, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10477, "tokens_per_second_per_gpu": 10368.72, "total_tokens": 1034319666 }, { "epoch": 0.6550387596899225, "grad_norm": 0.9131184220314026, "learning_rate": 2e-05, "loss": 0.6763, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10478, "tokens_per_second_per_gpu": 11684.42, "total_tokens": 1034424771 }, { "epoch": 0.6551012753188297, "grad_norm": 0.8655809760093689, "learning_rate": 2e-05, "loss": 0.6077, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10479, "tokens_per_second_per_gpu": 10864.28, "total_tokens": 1034524298 }, { "epoch": 0.6551637909477369, "grad_norm": 0.9314150214195251, "learning_rate": 2e-05, "loss": 0.5892, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10480, "tokens_per_second_per_gpu": 10822.29, "total_tokens": 1034623233 }, { "epoch": 0.6552263065766442, "grad_norm": 0.8335174918174744, "learning_rate": 2e-05, "loss": 0.6056, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10481, "tokens_per_second_per_gpu": 10061.0, "total_tokens": 1034723490 }, { "epoch": 0.6552888222055514, "grad_norm": 0.8646112084388733, "learning_rate": 2e-05, "loss": 0.5809, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10482, "tokens_per_second_per_gpu": 10896.53, "total_tokens": 1034822667 }, { "epoch": 0.6553513378344586, "grad_norm": 0.8339067697525024, "learning_rate": 2e-05, "loss": 0.6129, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10483, "tokens_per_second_per_gpu": 10968.39, "total_tokens": 1034923781 }, { "epoch": 0.6554138534633659, "grad_norm": 0.8900202512741089, "learning_rate": 2e-05, "loss": 0.6666, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10484, "tokens_per_second_per_gpu": 10837.8, "total_tokens": 1035026377 }, { "epoch": 0.655476369092273, "grad_norm": 0.923985481262207, "learning_rate": 2e-05, "loss": 0.5821, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10485, "tokens_per_second_per_gpu": 9502.52, "total_tokens": 1035118200 }, { "epoch": 0.6555388847211803, "grad_norm": 0.8737357258796692, "learning_rate": 2e-05, "loss": 0.6029, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10486, "tokens_per_second_per_gpu": 11004.97, "total_tokens": 1035214806 }, { "epoch": 0.6556014003500875, "grad_norm": 0.9170578122138977, "learning_rate": 2e-05, "loss": 0.613, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10487, "tokens_per_second_per_gpu": 9639.66, "total_tokens": 1035307920 }, { "epoch": 0.6556639159789948, "grad_norm": 0.8460748791694641, "learning_rate": 2e-05, "loss": 0.563, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10488, "tokens_per_second_per_gpu": 10307.7, "total_tokens": 1035406465 }, { "epoch": 0.655726431607902, "grad_norm": 1.0275472402572632, "learning_rate": 2e-05, "loss": 0.62, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10489, "tokens_per_second_per_gpu": 10406.52, "total_tokens": 1035504251 }, { "epoch": 0.6557889472368092, "grad_norm": 0.902146577835083, "learning_rate": 2e-05, "loss": 0.6288, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10490, "tokens_per_second_per_gpu": 10946.97, "total_tokens": 1035604714 }, { "epoch": 0.6558514628657164, "grad_norm": 0.9480287432670593, "learning_rate": 2e-05, "loss": 0.6416, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10491, "tokens_per_second_per_gpu": 10189.28, "total_tokens": 1035703046 }, { "epoch": 0.6559139784946236, "grad_norm": 0.964835524559021, "learning_rate": 2e-05, "loss": 0.6828, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10492, "tokens_per_second_per_gpu": 11022.4, "total_tokens": 1035808705 }, { "epoch": 0.6559764941235309, "grad_norm": 0.901963472366333, "learning_rate": 2e-05, "loss": 0.613, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10493, "tokens_per_second_per_gpu": 11507.87, "total_tokens": 1035913050 }, { "epoch": 0.6560390097524381, "grad_norm": 0.9589694142341614, "learning_rate": 2e-05, "loss": 0.6019, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10494, "tokens_per_second_per_gpu": 10165.29, "total_tokens": 1036006786 }, { "epoch": 0.6561015253813454, "grad_norm": 0.91021329164505, "learning_rate": 2e-05, "loss": 0.6379, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10495, "tokens_per_second_per_gpu": 10706.56, "total_tokens": 1036108493 }, { "epoch": 0.6561640410102526, "grad_norm": 0.9022516012191772, "learning_rate": 2e-05, "loss": 0.6675, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10496, "tokens_per_second_per_gpu": 11407.76, "total_tokens": 1036212533 }, { "epoch": 0.6562265566391597, "grad_norm": 0.9068300724029541, "learning_rate": 2e-05, "loss": 0.6614, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10497, "tokens_per_second_per_gpu": 11456.89, "total_tokens": 1036315191 }, { "epoch": 0.656289072268067, "grad_norm": 0.9668986201286316, "learning_rate": 2e-05, "loss": 0.6432, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10498, "tokens_per_second_per_gpu": 10655.27, "total_tokens": 1036413077 }, { "epoch": 0.6563515878969742, "grad_norm": 0.9211035966873169, "learning_rate": 2e-05, "loss": 0.6202, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10499, "tokens_per_second_per_gpu": 10278.98, "total_tokens": 1036510524 }, { "epoch": 0.6564141035258815, "grad_norm": 0.8878738880157471, "learning_rate": 2e-05, "loss": 0.596, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10500, "tokens_per_second_per_gpu": 9086.4, "total_tokens": 1036604026 }, { "epoch": 0.6564766191547887, "grad_norm": 0.9176825284957886, "learning_rate": 2e-05, "loss": 0.6407, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10501, "tokens_per_second_per_gpu": 9986.45, "total_tokens": 1036700508 }, { "epoch": 0.656539134783696, "grad_norm": 0.8581414222717285, "learning_rate": 2e-05, "loss": 0.5768, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10502, "tokens_per_second_per_gpu": 10850.37, "total_tokens": 1036799248 }, { "epoch": 0.6566016504126031, "grad_norm": 0.9437140822410583, "learning_rate": 2e-05, "loss": 0.6322, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10503, "tokens_per_second_per_gpu": 10250.84, "total_tokens": 1036895917 }, { "epoch": 0.6566641660415103, "grad_norm": 0.9076623916625977, "learning_rate": 2e-05, "loss": 0.6558, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10504, "tokens_per_second_per_gpu": 10567.26, "total_tokens": 1036995117 }, { "epoch": 0.6567266816704176, "grad_norm": 0.8855282068252563, "learning_rate": 2e-05, "loss": 0.6099, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10505, "tokens_per_second_per_gpu": 11073.72, "total_tokens": 1037097517 }, { "epoch": 0.6567891972993248, "grad_norm": 0.9088974595069885, "learning_rate": 2e-05, "loss": 0.6029, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10506, "tokens_per_second_per_gpu": 10926.32, "total_tokens": 1037198704 }, { "epoch": 0.6568517129282321, "grad_norm": 0.8685833215713501, "learning_rate": 2e-05, "loss": 0.6099, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10507, "tokens_per_second_per_gpu": 11130.27, "total_tokens": 1037304906 }, { "epoch": 0.6569142285571393, "grad_norm": 0.9344123005867004, "learning_rate": 2e-05, "loss": 0.6499, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10508, "tokens_per_second_per_gpu": 10096.68, "total_tokens": 1037406194 }, { "epoch": 0.6569767441860465, "grad_norm": 0.8799022436141968, "learning_rate": 2e-05, "loss": 0.6238, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10509, "tokens_per_second_per_gpu": 10645.34, "total_tokens": 1037507854 }, { "epoch": 0.6570392598149537, "grad_norm": 0.9123061895370483, "learning_rate": 2e-05, "loss": 0.6414, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10510, "tokens_per_second_per_gpu": 11086.85, "total_tokens": 1037607834 }, { "epoch": 0.657101775443861, "grad_norm": 0.8825539946556091, "learning_rate": 2e-05, "loss": 0.606, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10511, "tokens_per_second_per_gpu": 10740.74, "total_tokens": 1037707735 }, { "epoch": 0.6571642910727682, "grad_norm": 0.8746334314346313, "learning_rate": 2e-05, "loss": 0.606, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10512, "tokens_per_second_per_gpu": 10678.57, "total_tokens": 1037807414 }, { "epoch": 0.6572268067016754, "grad_norm": 0.9029841423034668, "learning_rate": 2e-05, "loss": 0.6197, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10513, "tokens_per_second_per_gpu": 10297.69, "total_tokens": 1037900786 }, { "epoch": 0.6572893223305827, "grad_norm": 0.9044237732887268, "learning_rate": 2e-05, "loss": 0.6622, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10514, "tokens_per_second_per_gpu": 10449.68, "total_tokens": 1038000504 }, { "epoch": 0.6573518379594899, "grad_norm": 0.8369779586791992, "learning_rate": 2e-05, "loss": 0.6226, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10515, "tokens_per_second_per_gpu": 10462.03, "total_tokens": 1038102920 }, { "epoch": 0.6574143535883971, "grad_norm": 0.880678117275238, "learning_rate": 2e-05, "loss": 0.6699, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10516, "tokens_per_second_per_gpu": 10830.23, "total_tokens": 1038198314 }, { "epoch": 0.6574768692173043, "grad_norm": 0.8919503092765808, "learning_rate": 2e-05, "loss": 0.6164, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10517, "tokens_per_second_per_gpu": 10207.01, "total_tokens": 1038295601 }, { "epoch": 0.6575393848462115, "grad_norm": 0.9261577725410461, "learning_rate": 2e-05, "loss": 0.6064, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10518, "tokens_per_second_per_gpu": 9794.77, "total_tokens": 1038389343 }, { "epoch": 0.6576019004751188, "grad_norm": 0.902506411075592, "learning_rate": 2e-05, "loss": 0.7246, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10519, "tokens_per_second_per_gpu": 10879.69, "total_tokens": 1038491261 }, { "epoch": 0.657664416104026, "grad_norm": 0.9091198444366455, "learning_rate": 2e-05, "loss": 0.6232, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10520, "tokens_per_second_per_gpu": 10405.96, "total_tokens": 1038591328 }, { "epoch": 0.6577269317329333, "grad_norm": 0.9047966599464417, "learning_rate": 2e-05, "loss": 0.612, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10521, "tokens_per_second_per_gpu": 10136.0, "total_tokens": 1038683502 }, { "epoch": 0.6577894473618404, "grad_norm": 0.9400819540023804, "learning_rate": 2e-05, "loss": 0.643, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10522, "tokens_per_second_per_gpu": 10486.57, "total_tokens": 1038779521 }, { "epoch": 0.6578519629907477, "grad_norm": 0.8897594213485718, "learning_rate": 2e-05, "loss": 0.6333, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10523, "tokens_per_second_per_gpu": 10475.71, "total_tokens": 1038880017 }, { "epoch": 0.6579144786196549, "grad_norm": 0.875819742679596, "learning_rate": 2e-05, "loss": 0.6629, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10524, "tokens_per_second_per_gpu": 10721.32, "total_tokens": 1038982773 }, { "epoch": 0.6579769942485622, "grad_norm": 0.8475925922393799, "learning_rate": 2e-05, "loss": 0.6004, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10525, "tokens_per_second_per_gpu": 10010.29, "total_tokens": 1039080212 }, { "epoch": 0.6580395098774694, "grad_norm": 0.8784693479537964, "learning_rate": 2e-05, "loss": 0.6405, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10526, "tokens_per_second_per_gpu": 10577.31, "total_tokens": 1039179433 }, { "epoch": 0.6581020255063766, "grad_norm": 0.8760021328926086, "learning_rate": 2e-05, "loss": 0.6442, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10527, "tokens_per_second_per_gpu": 10705.56, "total_tokens": 1039281791 }, { "epoch": 0.6581645411352838, "grad_norm": 0.881598711013794, "learning_rate": 2e-05, "loss": 0.6048, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10528, "tokens_per_second_per_gpu": 11017.95, "total_tokens": 1039381136 }, { "epoch": 0.658227056764191, "grad_norm": 0.9227729439735413, "learning_rate": 2e-05, "loss": 0.632, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10529, "tokens_per_second_per_gpu": 11683.44, "total_tokens": 1039483264 }, { "epoch": 0.6582895723930983, "grad_norm": 0.9020652174949646, "learning_rate": 2e-05, "loss": 0.5986, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10530, "tokens_per_second_per_gpu": 10448.16, "total_tokens": 1039578660 }, { "epoch": 0.6583520880220055, "grad_norm": 0.8918860554695129, "learning_rate": 2e-05, "loss": 0.613, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10531, "tokens_per_second_per_gpu": 9796.94, "total_tokens": 1039677564 }, { "epoch": 0.6584146036509128, "grad_norm": 0.886415958404541, "learning_rate": 2e-05, "loss": 0.6324, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10532, "tokens_per_second_per_gpu": 10922.39, "total_tokens": 1039778099 }, { "epoch": 0.65847711927982, "grad_norm": 0.8785861134529114, "learning_rate": 2e-05, "loss": 0.6242, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10533, "tokens_per_second_per_gpu": 10939.58, "total_tokens": 1039880817 }, { "epoch": 0.6585396349087271, "grad_norm": 0.8876049518585205, "learning_rate": 2e-05, "loss": 0.6097, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10534, "tokens_per_second_per_gpu": 10964.33, "total_tokens": 1039982460 }, { "epoch": 0.6586021505376344, "grad_norm": 0.8586876392364502, "learning_rate": 2e-05, "loss": 0.5952, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10535, "tokens_per_second_per_gpu": 9909.66, "total_tokens": 1040079348 }, { "epoch": 0.6586646661665416, "grad_norm": 0.9038999676704407, "learning_rate": 2e-05, "loss": 0.6136, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10536, "tokens_per_second_per_gpu": 10136.38, "total_tokens": 1040175579 }, { "epoch": 0.6587271817954489, "grad_norm": 0.8419688940048218, "learning_rate": 2e-05, "loss": 0.5983, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10537, "tokens_per_second_per_gpu": 9897.0, "total_tokens": 1040272641 }, { "epoch": 0.6587896974243561, "grad_norm": 0.9106158018112183, "learning_rate": 2e-05, "loss": 0.6332, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10538, "tokens_per_second_per_gpu": 10397.17, "total_tokens": 1040374447 }, { "epoch": 0.6588522130532634, "grad_norm": 0.8817551136016846, "learning_rate": 2e-05, "loss": 0.6584, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10539, "tokens_per_second_per_gpu": 10770.44, "total_tokens": 1040473685 }, { "epoch": 0.6589147286821705, "grad_norm": 0.9132806062698364, "learning_rate": 2e-05, "loss": 0.6557, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10540, "tokens_per_second_per_gpu": 11173.96, "total_tokens": 1040578741 }, { "epoch": 0.6589772443110777, "grad_norm": 0.8630906939506531, "learning_rate": 2e-05, "loss": 0.6043, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10541, "tokens_per_second_per_gpu": 10940.72, "total_tokens": 1040680061 }, { "epoch": 0.659039759939985, "grad_norm": 0.8766029477119446, "learning_rate": 2e-05, "loss": 0.6386, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10542, "tokens_per_second_per_gpu": 10717.54, "total_tokens": 1040780073 }, { "epoch": 0.6591022755688922, "grad_norm": 0.9522898197174072, "learning_rate": 2e-05, "loss": 0.6301, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10543, "tokens_per_second_per_gpu": 9544.8, "total_tokens": 1040873896 }, { "epoch": 0.6591647911977995, "grad_norm": 0.9053961038589478, "learning_rate": 2e-05, "loss": 0.6493, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10544, "tokens_per_second_per_gpu": 10678.22, "total_tokens": 1040972601 }, { "epoch": 0.6592273068267067, "grad_norm": 0.8725191950798035, "learning_rate": 2e-05, "loss": 0.6202, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10545, "tokens_per_second_per_gpu": 10819.17, "total_tokens": 1041073559 }, { "epoch": 0.6592898224556138, "grad_norm": 1.0404119491577148, "learning_rate": 2e-05, "loss": 0.6258, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10546, "tokens_per_second_per_gpu": 10346.63, "total_tokens": 1041169276 }, { "epoch": 0.6593523380845211, "grad_norm": 0.9122238755226135, "learning_rate": 2e-05, "loss": 0.638, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10547, "tokens_per_second_per_gpu": 9826.68, "total_tokens": 1041270628 }, { "epoch": 0.6594148537134283, "grad_norm": 0.8457951545715332, "learning_rate": 2e-05, "loss": 0.6267, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10548, "tokens_per_second_per_gpu": 10801.75, "total_tokens": 1041373796 }, { "epoch": 0.6594773693423356, "grad_norm": 0.905921459197998, "learning_rate": 2e-05, "loss": 0.6358, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10549, "tokens_per_second_per_gpu": 10088.88, "total_tokens": 1041469513 }, { "epoch": 0.6595398849712428, "grad_norm": 0.8929216265678406, "learning_rate": 2e-05, "loss": 0.6152, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10550, "tokens_per_second_per_gpu": 10539.81, "total_tokens": 1041568084 }, { "epoch": 0.6596024006001501, "grad_norm": 0.900083065032959, "learning_rate": 2e-05, "loss": 0.6242, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10551, "tokens_per_second_per_gpu": 10974.99, "total_tokens": 1041666077 }, { "epoch": 0.6596649162290573, "grad_norm": 0.8759707808494568, "learning_rate": 2e-05, "loss": 0.5947, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10552, "tokens_per_second_per_gpu": 10778.06, "total_tokens": 1041764758 }, { "epoch": 0.6597274318579645, "grad_norm": 0.8896105289459229, "learning_rate": 2e-05, "loss": 0.6576, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10553, "tokens_per_second_per_gpu": 10438.37, "total_tokens": 1041866246 }, { "epoch": 0.6597899474868717, "grad_norm": 0.8835515379905701, "learning_rate": 2e-05, "loss": 0.6305, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10554, "tokens_per_second_per_gpu": 10633.59, "total_tokens": 1041967639 }, { "epoch": 0.6598524631157789, "grad_norm": 0.9161809682846069, "learning_rate": 2e-05, "loss": 0.6116, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10555, "tokens_per_second_per_gpu": 10714.5, "total_tokens": 1042066639 }, { "epoch": 0.6599149787446862, "grad_norm": 0.9220142364501953, "learning_rate": 2e-05, "loss": 0.6116, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10556, "tokens_per_second_per_gpu": 10060.99, "total_tokens": 1042163078 }, { "epoch": 0.6599774943735934, "grad_norm": 0.9237912893295288, "learning_rate": 2e-05, "loss": 0.6472, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10557, "tokens_per_second_per_gpu": 10836.62, "total_tokens": 1042262522 }, { "epoch": 0.6600400100025007, "grad_norm": 0.8952894806861877, "learning_rate": 2e-05, "loss": 0.6144, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10558, "tokens_per_second_per_gpu": 11473.84, "total_tokens": 1042361610 }, { "epoch": 0.6601025256314078, "grad_norm": 0.8737086653709412, "learning_rate": 2e-05, "loss": 0.6371, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10559, "tokens_per_second_per_gpu": 10271.96, "total_tokens": 1042462468 }, { "epoch": 0.660165041260315, "grad_norm": 0.9292991757392883, "learning_rate": 2e-05, "loss": 0.6295, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10560, "tokens_per_second_per_gpu": 9805.31, "total_tokens": 1042557537 }, { "epoch": 0.6602275568892223, "grad_norm": 0.8836817741394043, "learning_rate": 2e-05, "loss": 0.6238, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10561, "tokens_per_second_per_gpu": 9549.73, "total_tokens": 1042654643 }, { "epoch": 0.6602900725181295, "grad_norm": 0.9283995032310486, "learning_rate": 2e-05, "loss": 0.6421, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10562, "tokens_per_second_per_gpu": 10951.0, "total_tokens": 1042757141 }, { "epoch": 0.6603525881470368, "grad_norm": 0.9034772515296936, "learning_rate": 2e-05, "loss": 0.6348, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10563, "tokens_per_second_per_gpu": 10226.72, "total_tokens": 1042854509 }, { "epoch": 0.660415103775944, "grad_norm": 0.8688820600509644, "learning_rate": 2e-05, "loss": 0.593, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10564, "tokens_per_second_per_gpu": 10618.17, "total_tokens": 1042952645 }, { "epoch": 0.6604776194048512, "grad_norm": 0.8778933882713318, "learning_rate": 2e-05, "loss": 0.6473, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10565, "tokens_per_second_per_gpu": 10764.25, "total_tokens": 1043053061 }, { "epoch": 0.6605401350337584, "grad_norm": 0.8884859085083008, "learning_rate": 2e-05, "loss": 0.6336, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10566, "tokens_per_second_per_gpu": 10900.08, "total_tokens": 1043151183 }, { "epoch": 0.6606026506626657, "grad_norm": 0.8776367902755737, "learning_rate": 2e-05, "loss": 0.6115, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10567, "tokens_per_second_per_gpu": 10721.14, "total_tokens": 1043253043 }, { "epoch": 0.6606651662915729, "grad_norm": 0.8811625838279724, "learning_rate": 2e-05, "loss": 0.639, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10568, "tokens_per_second_per_gpu": 10702.72, "total_tokens": 1043353680 }, { "epoch": 0.6607276819204801, "grad_norm": 0.8933497071266174, "learning_rate": 2e-05, "loss": 0.628, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10569, "tokens_per_second_per_gpu": 10689.11, "total_tokens": 1043454065 }, { "epoch": 0.6607901975493874, "grad_norm": 0.9069175720214844, "learning_rate": 2e-05, "loss": 0.6094, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10570, "tokens_per_second_per_gpu": 9561.67, "total_tokens": 1043546158 }, { "epoch": 0.6608527131782945, "grad_norm": 0.8844406604766846, "learning_rate": 2e-05, "loss": 0.6149, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10571, "tokens_per_second_per_gpu": 10603.69, "total_tokens": 1043642836 }, { "epoch": 0.6609152288072018, "grad_norm": 0.9244716167449951, "learning_rate": 2e-05, "loss": 0.6007, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10572, "tokens_per_second_per_gpu": 10517.66, "total_tokens": 1043741367 }, { "epoch": 0.660977744436109, "grad_norm": 0.8908151388168335, "learning_rate": 2e-05, "loss": 0.6638, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10573, "tokens_per_second_per_gpu": 11281.33, "total_tokens": 1043844535 }, { "epoch": 0.6610402600650163, "grad_norm": 0.8718096017837524, "learning_rate": 2e-05, "loss": 0.6414, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10574, "tokens_per_second_per_gpu": 11333.57, "total_tokens": 1043950141 }, { "epoch": 0.6611027756939235, "grad_norm": 0.8868069648742676, "learning_rate": 2e-05, "loss": 0.6133, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10575, "tokens_per_second_per_gpu": 10784.32, "total_tokens": 1044051353 }, { "epoch": 0.6611652913228308, "grad_norm": 0.8854786157608032, "learning_rate": 2e-05, "loss": 0.6239, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10576, "tokens_per_second_per_gpu": 10680.81, "total_tokens": 1044151809 }, { "epoch": 0.6612278069517379, "grad_norm": 0.9207128882408142, "learning_rate": 2e-05, "loss": 0.6353, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10577, "tokens_per_second_per_gpu": 10289.52, "total_tokens": 1044250124 }, { "epoch": 0.6612903225806451, "grad_norm": 0.8849967122077942, "learning_rate": 2e-05, "loss": 0.6355, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10578, "tokens_per_second_per_gpu": 10098.7, "total_tokens": 1044351232 }, { "epoch": 0.6613528382095524, "grad_norm": 0.9071990847587585, "learning_rate": 2e-05, "loss": 0.6382, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10579, "tokens_per_second_per_gpu": 10315.21, "total_tokens": 1044451252 }, { "epoch": 0.6614153538384596, "grad_norm": 0.8980393409729004, "learning_rate": 2e-05, "loss": 0.6201, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10580, "tokens_per_second_per_gpu": 10484.2, "total_tokens": 1044550775 }, { "epoch": 0.6614778694673669, "grad_norm": 0.9069865942001343, "learning_rate": 2e-05, "loss": 0.6337, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10581, "tokens_per_second_per_gpu": 10590.83, "total_tokens": 1044651493 }, { "epoch": 0.6615403850962741, "grad_norm": 0.8674399256706238, "learning_rate": 2e-05, "loss": 0.5979, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10582, "tokens_per_second_per_gpu": 11411.57, "total_tokens": 1044752448 }, { "epoch": 0.6616029007251812, "grad_norm": 0.8676560521125793, "learning_rate": 2e-05, "loss": 0.6168, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10583, "tokens_per_second_per_gpu": 10989.75, "total_tokens": 1044855479 }, { "epoch": 0.6616654163540885, "grad_norm": 0.8762600421905518, "learning_rate": 2e-05, "loss": 0.6372, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10584, "tokens_per_second_per_gpu": 11314.76, "total_tokens": 1044957526 }, { "epoch": 0.6617279319829957, "grad_norm": 0.8957862854003906, "learning_rate": 2e-05, "loss": 0.6103, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10585, "tokens_per_second_per_gpu": 10425.07, "total_tokens": 1045054238 }, { "epoch": 0.661790447611903, "grad_norm": 0.9074167013168335, "learning_rate": 2e-05, "loss": 0.6281, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10586, "tokens_per_second_per_gpu": 11088.83, "total_tokens": 1045159308 }, { "epoch": 0.6618529632408102, "grad_norm": 0.892579197883606, "learning_rate": 2e-05, "loss": 0.6475, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10587, "tokens_per_second_per_gpu": 10156.47, "total_tokens": 1045258886 }, { "epoch": 0.6619154788697175, "grad_norm": 0.8552541136741638, "learning_rate": 2e-05, "loss": 0.6079, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10588, "tokens_per_second_per_gpu": 10442.96, "total_tokens": 1045360067 }, { "epoch": 0.6619779944986247, "grad_norm": 0.8856149315834045, "learning_rate": 2e-05, "loss": 0.6174, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10589, "tokens_per_second_per_gpu": 10427.79, "total_tokens": 1045459082 }, { "epoch": 0.6620405101275318, "grad_norm": 0.8955144286155701, "learning_rate": 2e-05, "loss": 0.6757, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10590, "tokens_per_second_per_gpu": 11017.08, "total_tokens": 1045562953 }, { "epoch": 0.6621030257564391, "grad_norm": 0.8763320446014404, "learning_rate": 2e-05, "loss": 0.6352, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10591, "tokens_per_second_per_gpu": 11086.87, "total_tokens": 1045665558 }, { "epoch": 0.6621655413853463, "grad_norm": 0.8774299025535583, "learning_rate": 2e-05, "loss": 0.6394, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10592, "tokens_per_second_per_gpu": 11420.64, "total_tokens": 1045769756 }, { "epoch": 0.6622280570142536, "grad_norm": 0.8886688947677612, "learning_rate": 2e-05, "loss": 0.6439, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10593, "tokens_per_second_per_gpu": 10397.51, "total_tokens": 1045865298 }, { "epoch": 0.6622905726431608, "grad_norm": 0.8646227717399597, "learning_rate": 2e-05, "loss": 0.6054, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10594, "tokens_per_second_per_gpu": 10244.27, "total_tokens": 1045960426 }, { "epoch": 0.6623530882720681, "grad_norm": 0.8425052165985107, "learning_rate": 2e-05, "loss": 0.6098, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10595, "tokens_per_second_per_gpu": 10723.03, "total_tokens": 1046059090 }, { "epoch": 0.6624156039009752, "grad_norm": 0.8811507821083069, "learning_rate": 2e-05, "loss": 0.6554, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10596, "tokens_per_second_per_gpu": 11428.99, "total_tokens": 1046162788 }, { "epoch": 0.6624781195298824, "grad_norm": 0.8675703406333923, "learning_rate": 2e-05, "loss": 0.6555, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10597, "tokens_per_second_per_gpu": 10078.18, "total_tokens": 1046263658 }, { "epoch": 0.6625406351587897, "grad_norm": 0.890975832939148, "learning_rate": 2e-05, "loss": 0.6654, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10598, "tokens_per_second_per_gpu": 11146.93, "total_tokens": 1046367449 }, { "epoch": 0.6626031507876969, "grad_norm": 0.8822245001792908, "learning_rate": 2e-05, "loss": 0.6433, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10599, "tokens_per_second_per_gpu": 10390.29, "total_tokens": 1046465507 }, { "epoch": 0.6626656664166042, "grad_norm": 0.8403632640838623, "learning_rate": 2e-05, "loss": 0.6215, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10600, "tokens_per_second_per_gpu": 10930.84, "total_tokens": 1046567617 }, { "epoch": 0.6627281820455114, "grad_norm": 0.8696951270103455, "learning_rate": 2e-05, "loss": 0.62, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10601, "tokens_per_second_per_gpu": 10983.93, "total_tokens": 1046668174 }, { "epoch": 0.6627906976744186, "grad_norm": 0.8905613422393799, "learning_rate": 2e-05, "loss": 0.6417, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10602, "tokens_per_second_per_gpu": 10814.17, "total_tokens": 1046769998 }, { "epoch": 0.6628532133033258, "grad_norm": 0.886734664440155, "learning_rate": 2e-05, "loss": 0.6314, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10603, "tokens_per_second_per_gpu": 11002.21, "total_tokens": 1046873594 }, { "epoch": 0.662915728932233, "grad_norm": 0.8684006333351135, "learning_rate": 2e-05, "loss": 0.6031, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10604, "tokens_per_second_per_gpu": 10670.87, "total_tokens": 1046973693 }, { "epoch": 0.6629782445611403, "grad_norm": 1.0423170328140259, "learning_rate": 2e-05, "loss": 0.6056, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10605, "tokens_per_second_per_gpu": 10146.92, "total_tokens": 1047071577 }, { "epoch": 0.6630407601900475, "grad_norm": 0.9683254957199097, "learning_rate": 2e-05, "loss": 0.6342, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10606, "tokens_per_second_per_gpu": 10059.26, "total_tokens": 1047167475 }, { "epoch": 0.6631032758189548, "grad_norm": 0.937951385974884, "learning_rate": 2e-05, "loss": 0.669, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10607, "tokens_per_second_per_gpu": 11228.95, "total_tokens": 1047268799 }, { "epoch": 0.6631657914478619, "grad_norm": 0.8606305122375488, "learning_rate": 2e-05, "loss": 0.6217, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10608, "tokens_per_second_per_gpu": 10665.91, "total_tokens": 1047369438 }, { "epoch": 0.6632283070767692, "grad_norm": 0.9356392025947571, "learning_rate": 2e-05, "loss": 0.6982, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10609, "tokens_per_second_per_gpu": 10510.74, "total_tokens": 1047467747 }, { "epoch": 0.6632908227056764, "grad_norm": 0.8496146202087402, "learning_rate": 2e-05, "loss": 0.5657, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10610, "tokens_per_second_per_gpu": 10614.58, "total_tokens": 1047565521 }, { "epoch": 0.6633533383345837, "grad_norm": 0.8970472812652588, "learning_rate": 2e-05, "loss": 0.6328, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10611, "tokens_per_second_per_gpu": 11318.24, "total_tokens": 1047671721 }, { "epoch": 0.6634158539634909, "grad_norm": 0.8867670893669128, "learning_rate": 2e-05, "loss": 0.6564, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10612, "tokens_per_second_per_gpu": 10474.74, "total_tokens": 1047774044 }, { "epoch": 0.6634783695923981, "grad_norm": 0.8782774209976196, "learning_rate": 2e-05, "loss": 0.5989, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10613, "tokens_per_second_per_gpu": 10517.25, "total_tokens": 1047872508 }, { "epoch": 0.6635408852213053, "grad_norm": 0.9060965776443481, "learning_rate": 2e-05, "loss": 0.6286, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10614, "tokens_per_second_per_gpu": 10528.37, "total_tokens": 1047972068 }, { "epoch": 0.6636034008502125, "grad_norm": 0.882423996925354, "learning_rate": 2e-05, "loss": 0.6234, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10615, "tokens_per_second_per_gpu": 10699.69, "total_tokens": 1048074747 }, { "epoch": 0.6636659164791198, "grad_norm": 0.9107435941696167, "learning_rate": 2e-05, "loss": 0.6162, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10616, "tokens_per_second_per_gpu": 10488.05, "total_tokens": 1048170244 }, { "epoch": 0.663728432108027, "grad_norm": 0.8927881121635437, "learning_rate": 2e-05, "loss": 0.6046, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10617, "tokens_per_second_per_gpu": 10404.86, "total_tokens": 1048268678 }, { "epoch": 0.6637909477369343, "grad_norm": 0.9013478755950928, "learning_rate": 2e-05, "loss": 0.6142, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10618, "tokens_per_second_per_gpu": 10506.9, "total_tokens": 1048371256 }, { "epoch": 0.6638534633658415, "grad_norm": 0.8971379995346069, "learning_rate": 2e-05, "loss": 0.5966, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10619, "tokens_per_second_per_gpu": 10226.92, "total_tokens": 1048469061 }, { "epoch": 0.6639159789947486, "grad_norm": 0.8871796727180481, "learning_rate": 2e-05, "loss": 0.6264, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10620, "tokens_per_second_per_gpu": 10367.89, "total_tokens": 1048572000 }, { "epoch": 0.6639784946236559, "grad_norm": 0.9063327312469482, "learning_rate": 2e-05, "loss": 0.6426, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10621, "tokens_per_second_per_gpu": 10205.79, "total_tokens": 1048670598 }, { "epoch": 0.6640410102525631, "grad_norm": 0.8646423816680908, "learning_rate": 2e-05, "loss": 0.639, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10622, "tokens_per_second_per_gpu": 10685.14, "total_tokens": 1048773643 }, { "epoch": 0.6641035258814704, "grad_norm": 0.9014936685562134, "learning_rate": 2e-05, "loss": 0.6138, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10623, "tokens_per_second_per_gpu": 10101.56, "total_tokens": 1048869969 }, { "epoch": 0.6641660415103776, "grad_norm": 0.8966944813728333, "learning_rate": 2e-05, "loss": 0.6366, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10624, "tokens_per_second_per_gpu": 11048.57, "total_tokens": 1048974554 }, { "epoch": 0.6642285571392849, "grad_norm": 0.8949708938598633, "learning_rate": 2e-05, "loss": 0.6505, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10625, "tokens_per_second_per_gpu": 11013.29, "total_tokens": 1049075521 }, { "epoch": 0.6642910727681921, "grad_norm": 0.8630325794219971, "learning_rate": 2e-05, "loss": 0.5863, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10626, "tokens_per_second_per_gpu": 10985.08, "total_tokens": 1049174350 }, { "epoch": 0.6643535883970992, "grad_norm": 0.9301555752754211, "learning_rate": 2e-05, "loss": 0.6311, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10627, "tokens_per_second_per_gpu": 10294.01, "total_tokens": 1049269574 }, { "epoch": 0.6644161040260065, "grad_norm": 0.8750957250595093, "learning_rate": 2e-05, "loss": 0.6382, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10628, "tokens_per_second_per_gpu": 10596.02, "total_tokens": 1049369163 }, { "epoch": 0.6644786196549137, "grad_norm": 0.8809004426002502, "learning_rate": 2e-05, "loss": 0.6223, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10629, "tokens_per_second_per_gpu": 10728.03, "total_tokens": 1049470212 }, { "epoch": 0.664541135283821, "grad_norm": 0.8671125769615173, "learning_rate": 2e-05, "loss": 0.6161, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10630, "tokens_per_second_per_gpu": 11094.58, "total_tokens": 1049568490 }, { "epoch": 0.6646036509127282, "grad_norm": 0.8873682618141174, "learning_rate": 2e-05, "loss": 0.629, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10631, "tokens_per_second_per_gpu": 10734.46, "total_tokens": 1049666515 }, { "epoch": 0.6646661665416355, "grad_norm": 0.9202589392662048, "learning_rate": 2e-05, "loss": 0.6387, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10632, "tokens_per_second_per_gpu": 10430.27, "total_tokens": 1049762385 }, { "epoch": 0.6647286821705426, "grad_norm": 0.9133999943733215, "learning_rate": 2e-05, "loss": 0.6176, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10633, "tokens_per_second_per_gpu": 10581.29, "total_tokens": 1049856147 }, { "epoch": 0.6647911977994498, "grad_norm": 0.8951857686042786, "learning_rate": 2e-05, "loss": 0.6215, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10634, "tokens_per_second_per_gpu": 10267.06, "total_tokens": 1049954626 }, { "epoch": 0.6648537134283571, "grad_norm": 0.8702861070632935, "learning_rate": 2e-05, "loss": 0.6371, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10635, "tokens_per_second_per_gpu": 10300.38, "total_tokens": 1050053598 }, { "epoch": 0.6649162290572643, "grad_norm": 0.8892465829849243, "learning_rate": 2e-05, "loss": 0.5974, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10636, "tokens_per_second_per_gpu": 10572.1, "total_tokens": 1050152230 }, { "epoch": 0.6649787446861716, "grad_norm": 0.9011358022689819, "learning_rate": 2e-05, "loss": 0.6326, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10637, "tokens_per_second_per_gpu": 10828.83, "total_tokens": 1050250348 }, { "epoch": 0.6650412603150788, "grad_norm": 0.8811028599739075, "learning_rate": 2e-05, "loss": 0.6074, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10638, "tokens_per_second_per_gpu": 10267.37, "total_tokens": 1050350286 }, { "epoch": 0.665103775943986, "grad_norm": 0.8620156049728394, "learning_rate": 2e-05, "loss": 0.6078, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10639, "tokens_per_second_per_gpu": 10734.02, "total_tokens": 1050448590 }, { "epoch": 0.6651662915728932, "grad_norm": 0.888832688331604, "learning_rate": 2e-05, "loss": 0.6375, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10640, "tokens_per_second_per_gpu": 10711.04, "total_tokens": 1050550502 }, { "epoch": 0.6652288072018004, "grad_norm": 0.8793691992759705, "learning_rate": 2e-05, "loss": 0.6555, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10641, "tokens_per_second_per_gpu": 10388.82, "total_tokens": 1050651390 }, { "epoch": 0.6652913228307077, "grad_norm": 0.8869732618331909, "learning_rate": 2e-05, "loss": 0.6283, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10642, "tokens_per_second_per_gpu": 10472.22, "total_tokens": 1050749486 }, { "epoch": 0.6653538384596149, "grad_norm": 0.8787962794303894, "learning_rate": 2e-05, "loss": 0.6305, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10643, "tokens_per_second_per_gpu": 10853.34, "total_tokens": 1050849771 }, { "epoch": 0.6654163540885222, "grad_norm": 0.8780680894851685, "learning_rate": 2e-05, "loss": 0.6695, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10644, "tokens_per_second_per_gpu": 10873.93, "total_tokens": 1050948228 }, { "epoch": 0.6654788697174293, "grad_norm": 0.8880980014801025, "learning_rate": 2e-05, "loss": 0.6038, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10645, "tokens_per_second_per_gpu": 10654.74, "total_tokens": 1051043424 }, { "epoch": 0.6655413853463366, "grad_norm": 0.8840354681015015, "learning_rate": 2e-05, "loss": 0.6068, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10646, "tokens_per_second_per_gpu": 10776.18, "total_tokens": 1051141421 }, { "epoch": 0.6656039009752438, "grad_norm": 0.9357896447181702, "learning_rate": 2e-05, "loss": 0.6445, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10647, "tokens_per_second_per_gpu": 9862.07, "total_tokens": 1051239871 }, { "epoch": 0.665666416604151, "grad_norm": 0.9194230437278748, "learning_rate": 2e-05, "loss": 0.5877, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10648, "tokens_per_second_per_gpu": 8722.83, "total_tokens": 1051330236 }, { "epoch": 0.6657289322330583, "grad_norm": 0.9290064573287964, "learning_rate": 2e-05, "loss": 0.6053, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10649, "tokens_per_second_per_gpu": 10251.64, "total_tokens": 1051427978 }, { "epoch": 0.6657914478619655, "grad_norm": 0.8681589961051941, "learning_rate": 2e-05, "loss": 0.6054, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10650, "tokens_per_second_per_gpu": 10782.26, "total_tokens": 1051524780 }, { "epoch": 0.6658539634908727, "grad_norm": 0.8725833296775818, "learning_rate": 2e-05, "loss": 0.6338, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10651, "tokens_per_second_per_gpu": 10623.6, "total_tokens": 1051623864 }, { "epoch": 0.6659164791197799, "grad_norm": 0.927638828754425, "learning_rate": 2e-05, "loss": 0.6463, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10652, "tokens_per_second_per_gpu": 9385.76, "total_tokens": 1051718784 }, { "epoch": 0.6659789947486872, "grad_norm": 0.9142342805862427, "learning_rate": 2e-05, "loss": 0.6287, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10653, "tokens_per_second_per_gpu": 9796.09, "total_tokens": 1051814614 }, { "epoch": 0.6660415103775944, "grad_norm": 0.9236589670181274, "learning_rate": 2e-05, "loss": 0.5734, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10654, "tokens_per_second_per_gpu": 9544.33, "total_tokens": 1051909614 }, { "epoch": 0.6661040260065016, "grad_norm": 0.914528489112854, "learning_rate": 2e-05, "loss": 0.6611, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10655, "tokens_per_second_per_gpu": 9909.67, "total_tokens": 1052006090 }, { "epoch": 0.6661665416354089, "grad_norm": 0.897022008895874, "learning_rate": 2e-05, "loss": 0.6714, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10656, "tokens_per_second_per_gpu": 9426.08, "total_tokens": 1052102460 }, { "epoch": 0.666229057264316, "grad_norm": 0.9023743271827698, "learning_rate": 2e-05, "loss": 0.6123, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10657, "tokens_per_second_per_gpu": 10061.42, "total_tokens": 1052201446 }, { "epoch": 0.6662915728932233, "grad_norm": 0.8728657960891724, "learning_rate": 2e-05, "loss": 0.6069, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10658, "tokens_per_second_per_gpu": 10842.83, "total_tokens": 1052301713 }, { "epoch": 0.6663540885221305, "grad_norm": 0.9109356999397278, "learning_rate": 2e-05, "loss": 0.6618, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10659, "tokens_per_second_per_gpu": 10460.86, "total_tokens": 1052398514 }, { "epoch": 0.6664166041510378, "grad_norm": 0.91585773229599, "learning_rate": 2e-05, "loss": 0.6339, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10660, "tokens_per_second_per_gpu": 9647.69, "total_tokens": 1052495997 }, { "epoch": 0.666479119779945, "grad_norm": 0.9233642816543579, "learning_rate": 2e-05, "loss": 0.6621, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10661, "tokens_per_second_per_gpu": 10634.05, "total_tokens": 1052595361 }, { "epoch": 0.6665416354088523, "grad_norm": 0.8792650103569031, "learning_rate": 2e-05, "loss": 0.6373, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10662, "tokens_per_second_per_gpu": 10144.65, "total_tokens": 1052693335 }, { "epoch": 0.6666041510377594, "grad_norm": 0.9247052073478699, "learning_rate": 2e-05, "loss": 0.6267, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10663, "tokens_per_second_per_gpu": 9575.57, "total_tokens": 1052789187 }, { "epoch": 0.6666666666666666, "grad_norm": 0.8633689880371094, "learning_rate": 2e-05, "loss": 0.5861, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10664, "tokens_per_second_per_gpu": 10170.99, "total_tokens": 1052886995 }, { "epoch": 0.6667291822955739, "grad_norm": 0.8891928195953369, "learning_rate": 2e-05, "loss": 0.6775, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10665, "tokens_per_second_per_gpu": 10942.89, "total_tokens": 1052990827 }, { "epoch": 0.6667916979244811, "grad_norm": 0.8789278268814087, "learning_rate": 2e-05, "loss": 0.6006, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10666, "tokens_per_second_per_gpu": 10360.46, "total_tokens": 1053090583 }, { "epoch": 0.6668542135533884, "grad_norm": 0.8725203275680542, "learning_rate": 2e-05, "loss": 0.6718, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10667, "tokens_per_second_per_gpu": 10966.1, "total_tokens": 1053194141 }, { "epoch": 0.6669167291822956, "grad_norm": 0.8851938843727112, "learning_rate": 2e-05, "loss": 0.5899, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10668, "tokens_per_second_per_gpu": 9022.83, "total_tokens": 1053286312 }, { "epoch": 0.6669792448112029, "grad_norm": 0.8971627950668335, "learning_rate": 2e-05, "loss": 0.6831, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10669, "tokens_per_second_per_gpu": 9949.94, "total_tokens": 1053384096 }, { "epoch": 0.66704176044011, "grad_norm": 0.8679808974266052, "learning_rate": 2e-05, "loss": 0.6417, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10670, "tokens_per_second_per_gpu": 10307.47, "total_tokens": 1053485566 }, { "epoch": 0.6671042760690172, "grad_norm": 0.8699679374694824, "learning_rate": 2e-05, "loss": 0.5987, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10671, "tokens_per_second_per_gpu": 10730.37, "total_tokens": 1053585402 }, { "epoch": 0.6671667916979245, "grad_norm": 0.8581676483154297, "learning_rate": 2e-05, "loss": 0.6134, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10672, "tokens_per_second_per_gpu": 10178.68, "total_tokens": 1053684103 }, { "epoch": 0.6672293073268317, "grad_norm": 0.9464673399925232, "learning_rate": 2e-05, "loss": 0.6817, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10673, "tokens_per_second_per_gpu": 10698.52, "total_tokens": 1053783556 }, { "epoch": 0.667291822955739, "grad_norm": 0.9737187027931213, "learning_rate": 2e-05, "loss": 0.6279, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10674, "tokens_per_second_per_gpu": 10166.81, "total_tokens": 1053878922 }, { "epoch": 0.6673543385846462, "grad_norm": 0.9145589470863342, "learning_rate": 2e-05, "loss": 0.6459, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10675, "tokens_per_second_per_gpu": 10579.71, "total_tokens": 1053979336 }, { "epoch": 0.6674168542135533, "grad_norm": 0.9029332399368286, "learning_rate": 2e-05, "loss": 0.6339, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10676, "tokens_per_second_per_gpu": 11061.32, "total_tokens": 1054077032 }, { "epoch": 0.6674793698424606, "grad_norm": 0.8752316236495972, "learning_rate": 2e-05, "loss": 0.6049, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10677, "tokens_per_second_per_gpu": 10599.3, "total_tokens": 1054173831 }, { "epoch": 0.6675418854713678, "grad_norm": 0.9189480543136597, "learning_rate": 2e-05, "loss": 0.6089, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10678, "tokens_per_second_per_gpu": 10907.97, "total_tokens": 1054275012 }, { "epoch": 0.6676044011002751, "grad_norm": 0.876404881477356, "learning_rate": 2e-05, "loss": 0.6549, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10679, "tokens_per_second_per_gpu": 11183.6, "total_tokens": 1054374534 }, { "epoch": 0.6676669167291823, "grad_norm": 0.8928532004356384, "learning_rate": 2e-05, "loss": 0.6073, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10680, "tokens_per_second_per_gpu": 9909.52, "total_tokens": 1054473679 }, { "epoch": 0.6677294323580896, "grad_norm": 0.9017208218574524, "learning_rate": 2e-05, "loss": 0.6114, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10681, "tokens_per_second_per_gpu": 10797.11, "total_tokens": 1054573764 }, { "epoch": 0.6677919479869967, "grad_norm": 0.8753914833068848, "learning_rate": 2e-05, "loss": 0.6399, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10682, "tokens_per_second_per_gpu": 10719.98, "total_tokens": 1054674628 }, { "epoch": 0.667854463615904, "grad_norm": 0.8639863133430481, "learning_rate": 2e-05, "loss": 0.5801, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10683, "tokens_per_second_per_gpu": 10118.91, "total_tokens": 1054772719 }, { "epoch": 0.6679169792448112, "grad_norm": 0.9136433601379395, "learning_rate": 2e-05, "loss": 0.6789, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10684, "tokens_per_second_per_gpu": 11450.03, "total_tokens": 1054877919 }, { "epoch": 0.6679794948737184, "grad_norm": 0.8900628685951233, "learning_rate": 2e-05, "loss": 0.6106, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10685, "tokens_per_second_per_gpu": 10931.26, "total_tokens": 1054979471 }, { "epoch": 0.6680420105026257, "grad_norm": 0.8884760737419128, "learning_rate": 2e-05, "loss": 0.6472, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10686, "tokens_per_second_per_gpu": 10880.65, "total_tokens": 1055080831 }, { "epoch": 0.6681045261315329, "grad_norm": 0.8938655257225037, "learning_rate": 2e-05, "loss": 0.6396, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10687, "tokens_per_second_per_gpu": 10982.96, "total_tokens": 1055181300 }, { "epoch": 0.6681670417604401, "grad_norm": 0.8663169145584106, "learning_rate": 2e-05, "loss": 0.6155, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10688, "tokens_per_second_per_gpu": 11424.28, "total_tokens": 1055282516 }, { "epoch": 0.6682295573893473, "grad_norm": 0.8609714508056641, "learning_rate": 2e-05, "loss": 0.6571, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10689, "tokens_per_second_per_gpu": 11149.38, "total_tokens": 1055384401 }, { "epoch": 0.6682920730182546, "grad_norm": 0.8955519199371338, "learning_rate": 2e-05, "loss": 0.6111, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10690, "tokens_per_second_per_gpu": 10148.56, "total_tokens": 1055481108 }, { "epoch": 0.6683545886471618, "grad_norm": 0.8701267838478088, "learning_rate": 2e-05, "loss": 0.6281, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10691, "tokens_per_second_per_gpu": 10724.1, "total_tokens": 1055583059 }, { "epoch": 0.668417104276069, "grad_norm": 0.9166762828826904, "learning_rate": 2e-05, "loss": 0.6353, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10692, "tokens_per_second_per_gpu": 9833.03, "total_tokens": 1055680259 }, { "epoch": 0.6684796199049763, "grad_norm": 0.9127050638198853, "learning_rate": 2e-05, "loss": 0.6449, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10693, "tokens_per_second_per_gpu": 10733.92, "total_tokens": 1055776092 }, { "epoch": 0.6685421355338834, "grad_norm": 0.9029021263122559, "learning_rate": 2e-05, "loss": 0.5908, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10694, "tokens_per_second_per_gpu": 15500.49, "total_tokens": 1055868780 }, { "epoch": 0.6686046511627907, "grad_norm": 0.8782948851585388, "learning_rate": 2e-05, "loss": 0.603, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10695, "tokens_per_second_per_gpu": 16748.93, "total_tokens": 1055966775 }, { "epoch": 0.6686671667916979, "grad_norm": 0.8992340564727783, "learning_rate": 2e-05, "loss": 0.6537, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10696, "tokens_per_second_per_gpu": 17635.24, "total_tokens": 1056064478 }, { "epoch": 0.6687296824206052, "grad_norm": 0.9012224078178406, "learning_rate": 2e-05, "loss": 0.6252, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10697, "tokens_per_second_per_gpu": 18156.3, "total_tokens": 1056162129 }, { "epoch": 0.6687921980495124, "grad_norm": 0.8978195190429688, "learning_rate": 2e-05, "loss": 0.6594, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10698, "tokens_per_second_per_gpu": 16545.95, "total_tokens": 1056258668 }, { "epoch": 0.6688547136784196, "grad_norm": 0.891904890537262, "learning_rate": 2e-05, "loss": 0.635, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10699, "tokens_per_second_per_gpu": 18698.38, "total_tokens": 1056358875 }, { "epoch": 0.6689172293073268, "grad_norm": 0.8816952705383301, "learning_rate": 2e-05, "loss": 0.6437, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10700, "tokens_per_second_per_gpu": 18470.12, "total_tokens": 1056461510 }, { "epoch": 0.668979744936234, "grad_norm": 0.8987732529640198, "learning_rate": 2e-05, "loss": 0.5934, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10701, "tokens_per_second_per_gpu": 16266.69, "total_tokens": 1056555081 }, { "epoch": 0.6690422605651413, "grad_norm": 0.8998615741729736, "learning_rate": 2e-05, "loss": 0.6209, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10702, "tokens_per_second_per_gpu": 17328.59, "total_tokens": 1056653350 }, { "epoch": 0.6691047761940485, "grad_norm": 0.8462374806404114, "learning_rate": 2e-05, "loss": 0.572, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10703, "tokens_per_second_per_gpu": 16682.98, "total_tokens": 1056751470 }, { "epoch": 0.6691672918229558, "grad_norm": 0.8863589763641357, "learning_rate": 2e-05, "loss": 0.6091, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10704, "tokens_per_second_per_gpu": 16618.28, "total_tokens": 1056849688 }, { "epoch": 0.669229807451863, "grad_norm": 0.8960819244384766, "learning_rate": 2e-05, "loss": 0.5881, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10705, "tokens_per_second_per_gpu": 17509.82, "total_tokens": 1056946786 }, { "epoch": 0.6692923230807702, "grad_norm": 0.8908653259277344, "learning_rate": 2e-05, "loss": 0.63, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10706, "tokens_per_second_per_gpu": 14768.03, "total_tokens": 1057047054 }, { "epoch": 0.6693548387096774, "grad_norm": 0.8962915539741516, "learning_rate": 2e-05, "loss": 0.589, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10707, "tokens_per_second_per_gpu": 14822.13, "total_tokens": 1057140370 }, { "epoch": 0.6694173543385846, "grad_norm": 0.9009056091308594, "learning_rate": 2e-05, "loss": 0.6198, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10708, "tokens_per_second_per_gpu": 18037.06, "total_tokens": 1057238635 }, { "epoch": 0.6694798699674919, "grad_norm": 0.908501923084259, "learning_rate": 2e-05, "loss": 0.63, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10709, "tokens_per_second_per_gpu": 15979.13, "total_tokens": 1057334524 }, { "epoch": 0.6695423855963991, "grad_norm": 0.9105039238929749, "learning_rate": 2e-05, "loss": 0.6212, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10710, "tokens_per_second_per_gpu": 16931.05, "total_tokens": 1057437181 }, { "epoch": 0.6696049012253064, "grad_norm": 0.9087932109832764, "learning_rate": 2e-05, "loss": 0.6029, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10711, "tokens_per_second_per_gpu": 12694.2, "total_tokens": 1057539578 }, { "epoch": 0.6696674168542136, "grad_norm": 0.8756686449050903, "learning_rate": 2e-05, "loss": 0.5921, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10712, "tokens_per_second_per_gpu": 11340.22, "total_tokens": 1057642475 }, { "epoch": 0.6697299324831207, "grad_norm": 0.8977320194244385, "learning_rate": 2e-05, "loss": 0.6423, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10713, "tokens_per_second_per_gpu": 10778.76, "total_tokens": 1057740592 }, { "epoch": 0.669792448112028, "grad_norm": 0.9004193544387817, "learning_rate": 2e-05, "loss": 0.6762, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10714, "tokens_per_second_per_gpu": 10946.88, "total_tokens": 1057843996 }, { "epoch": 0.6698549637409352, "grad_norm": 0.941196620464325, "learning_rate": 2e-05, "loss": 0.6798, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10715, "tokens_per_second_per_gpu": 10467.96, "total_tokens": 1057944513 }, { "epoch": 0.6699174793698425, "grad_norm": 0.9036375880241394, "learning_rate": 2e-05, "loss": 0.6361, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10716, "tokens_per_second_per_gpu": 10487.1, "total_tokens": 1058041710 }, { "epoch": 0.6699799949987497, "grad_norm": 0.8664828538894653, "learning_rate": 2e-05, "loss": 0.6029, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10717, "tokens_per_second_per_gpu": 10504.99, "total_tokens": 1058139317 }, { "epoch": 0.670042510627657, "grad_norm": 0.8893115520477295, "learning_rate": 2e-05, "loss": 0.6242, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10718, "tokens_per_second_per_gpu": 10127.46, "total_tokens": 1058238093 }, { "epoch": 0.6701050262565641, "grad_norm": 0.8968809247016907, "learning_rate": 2e-05, "loss": 0.6056, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10719, "tokens_per_second_per_gpu": 10376.09, "total_tokens": 1058333316 }, { "epoch": 0.6701675418854713, "grad_norm": 0.9046705365180969, "learning_rate": 2e-05, "loss": 0.6561, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10720, "tokens_per_second_per_gpu": 9747.13, "total_tokens": 1058430111 }, { "epoch": 0.6702300575143786, "grad_norm": 0.9361785650253296, "learning_rate": 2e-05, "loss": 0.6315, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10721, "tokens_per_second_per_gpu": 10474.42, "total_tokens": 1058529262 }, { "epoch": 0.6702925731432858, "grad_norm": 0.9281249642372131, "learning_rate": 2e-05, "loss": 0.6604, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10722, "tokens_per_second_per_gpu": 11253.18, "total_tokens": 1058630783 }, { "epoch": 0.6703550887721931, "grad_norm": 0.9052818417549133, "learning_rate": 2e-05, "loss": 0.6258, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10723, "tokens_per_second_per_gpu": 11261.04, "total_tokens": 1058731407 }, { "epoch": 0.6704176044011003, "grad_norm": 0.8786386251449585, "learning_rate": 2e-05, "loss": 0.5979, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10724, "tokens_per_second_per_gpu": 10450.19, "total_tokens": 1058830473 }, { "epoch": 0.6704801200300075, "grad_norm": 0.871979296207428, "learning_rate": 2e-05, "loss": 0.6413, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10725, "tokens_per_second_per_gpu": 10449.69, "total_tokens": 1058931542 }, { "epoch": 0.6705426356589147, "grad_norm": 0.865662693977356, "learning_rate": 2e-05, "loss": 0.6005, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10726, "tokens_per_second_per_gpu": 10788.47, "total_tokens": 1059027735 }, { "epoch": 0.6706051512878219, "grad_norm": 0.917332649230957, "learning_rate": 2e-05, "loss": 0.6442, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10727, "tokens_per_second_per_gpu": 10842.13, "total_tokens": 1059129991 }, { "epoch": 0.6706676669167292, "grad_norm": 0.9134423136711121, "learning_rate": 2e-05, "loss": 0.6528, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10728, "tokens_per_second_per_gpu": 9600.4, "total_tokens": 1059226877 }, { "epoch": 0.6707301825456364, "grad_norm": 0.8688138127326965, "learning_rate": 2e-05, "loss": 0.6162, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10729, "tokens_per_second_per_gpu": 10256.3, "total_tokens": 1059326454 }, { "epoch": 0.6707926981745437, "grad_norm": 0.8915706276893616, "learning_rate": 2e-05, "loss": 0.6224, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10730, "tokens_per_second_per_gpu": 10731.28, "total_tokens": 1059426593 }, { "epoch": 0.6708552138034508, "grad_norm": 0.9139304757118225, "learning_rate": 2e-05, "loss": 0.6384, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10731, "tokens_per_second_per_gpu": 10698.64, "total_tokens": 1059526604 }, { "epoch": 0.6709177294323581, "grad_norm": 0.9322373867034912, "learning_rate": 2e-05, "loss": 0.5986, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10732, "tokens_per_second_per_gpu": 9877.99, "total_tokens": 1059621736 }, { "epoch": 0.6709802450612653, "grad_norm": 1.0001556873321533, "learning_rate": 2e-05, "loss": 0.6475, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10733, "tokens_per_second_per_gpu": 10223.14, "total_tokens": 1059719333 }, { "epoch": 0.6710427606901725, "grad_norm": 0.9130312204360962, "learning_rate": 2e-05, "loss": 0.6214, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10734, "tokens_per_second_per_gpu": 10383.69, "total_tokens": 1059816071 }, { "epoch": 0.6711052763190798, "grad_norm": 0.9179033041000366, "learning_rate": 2e-05, "loss": 0.6345, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10735, "tokens_per_second_per_gpu": 9679.95, "total_tokens": 1059910771 }, { "epoch": 0.671167791947987, "grad_norm": 0.8841503858566284, "learning_rate": 2e-05, "loss": 0.6254, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10736, "tokens_per_second_per_gpu": 9645.27, "total_tokens": 1060006593 }, { "epoch": 0.6712303075768942, "grad_norm": 0.8869289755821228, "learning_rate": 2e-05, "loss": 0.6157, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10737, "tokens_per_second_per_gpu": 9965.58, "total_tokens": 1060105713 }, { "epoch": 0.6712928232058014, "grad_norm": 0.8762810826301575, "learning_rate": 2e-05, "loss": 0.6009, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10738, "tokens_per_second_per_gpu": 11454.74, "total_tokens": 1060206945 }, { "epoch": 0.6713553388347087, "grad_norm": 0.8429937958717346, "learning_rate": 2e-05, "loss": 0.6445, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10739, "tokens_per_second_per_gpu": 10951.66, "total_tokens": 1060310284 }, { "epoch": 0.6714178544636159, "grad_norm": 0.9541416764259338, "learning_rate": 2e-05, "loss": 0.6397, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10740, "tokens_per_second_per_gpu": 10422.49, "total_tokens": 1060412767 }, { "epoch": 0.6714803700925231, "grad_norm": 0.9632655382156372, "learning_rate": 2e-05, "loss": 0.5845, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10741, "tokens_per_second_per_gpu": 10319.45, "total_tokens": 1060510584 }, { "epoch": 0.6715428857214304, "grad_norm": 0.9067928194999695, "learning_rate": 2e-05, "loss": 0.6214, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10742, "tokens_per_second_per_gpu": 8969.81, "total_tokens": 1060603959 }, { "epoch": 0.6716054013503376, "grad_norm": 0.9023619890213013, "learning_rate": 2e-05, "loss": 0.6122, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10743, "tokens_per_second_per_gpu": 10030.62, "total_tokens": 1060701591 }, { "epoch": 0.6716679169792448, "grad_norm": 0.9495009183883667, "learning_rate": 2e-05, "loss": 0.6194, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10744, "tokens_per_second_per_gpu": 10055.72, "total_tokens": 1060797672 }, { "epoch": 0.671730432608152, "grad_norm": 0.9146192073822021, "learning_rate": 2e-05, "loss": 0.6809, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10745, "tokens_per_second_per_gpu": 11146.69, "total_tokens": 1060899763 }, { "epoch": 0.6717929482370593, "grad_norm": 0.9101232886314392, "learning_rate": 2e-05, "loss": 0.6642, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10746, "tokens_per_second_per_gpu": 10426.2, "total_tokens": 1061000916 }, { "epoch": 0.6718554638659665, "grad_norm": 0.9333688020706177, "learning_rate": 2e-05, "loss": 0.6446, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10747, "tokens_per_second_per_gpu": 11005.11, "total_tokens": 1061101443 }, { "epoch": 0.6719179794948738, "grad_norm": 0.9202268123626709, "learning_rate": 2e-05, "loss": 0.651, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10748, "tokens_per_second_per_gpu": 10391.72, "total_tokens": 1061199715 }, { "epoch": 0.671980495123781, "grad_norm": 0.9241102933883667, "learning_rate": 2e-05, "loss": 0.661, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10749, "tokens_per_second_per_gpu": 10772.33, "total_tokens": 1061298926 }, { "epoch": 0.6720430107526881, "grad_norm": 0.8907175660133362, "learning_rate": 2e-05, "loss": 0.6048, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10750, "tokens_per_second_per_gpu": 10239.62, "total_tokens": 1061396469 }, { "epoch": 0.6721055263815954, "grad_norm": 0.9487563371658325, "learning_rate": 2e-05, "loss": 0.6728, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10751, "tokens_per_second_per_gpu": 10136.9, "total_tokens": 1061490965 }, { "epoch": 0.6721680420105026, "grad_norm": 0.9451466202735901, "learning_rate": 2e-05, "loss": 0.6289, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10752, "tokens_per_second_per_gpu": 10881.6, "total_tokens": 1061591021 }, { "epoch": 0.6722305576394099, "grad_norm": 0.8877456188201904, "learning_rate": 2e-05, "loss": 0.6177, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10753, "tokens_per_second_per_gpu": 10058.19, "total_tokens": 1061689345 }, { "epoch": 0.6722930732683171, "grad_norm": 0.9802790284156799, "learning_rate": 2e-05, "loss": 0.6589, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10754, "tokens_per_second_per_gpu": 10546.87, "total_tokens": 1061788731 }, { "epoch": 0.6723555888972244, "grad_norm": 0.8812233209609985, "learning_rate": 2e-05, "loss": 0.6351, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10755, "tokens_per_second_per_gpu": 11193.61, "total_tokens": 1061891449 }, { "epoch": 0.6724181045261315, "grad_norm": 0.8635625243186951, "learning_rate": 2e-05, "loss": 0.6308, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10756, "tokens_per_second_per_gpu": 11134.92, "total_tokens": 1061994142 }, { "epoch": 0.6724806201550387, "grad_norm": 0.9098506569862366, "learning_rate": 2e-05, "loss": 0.6202, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10757, "tokens_per_second_per_gpu": 10206.9, "total_tokens": 1062089246 }, { "epoch": 0.672543135783946, "grad_norm": 0.8479458093643188, "learning_rate": 2e-05, "loss": 0.6003, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10758, "tokens_per_second_per_gpu": 10659.47, "total_tokens": 1062191271 }, { "epoch": 0.6726056514128532, "grad_norm": 0.8634423017501831, "learning_rate": 2e-05, "loss": 0.64, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10759, "tokens_per_second_per_gpu": 10956.02, "total_tokens": 1062293863 }, { "epoch": 0.6726681670417605, "grad_norm": 0.8764210343360901, "learning_rate": 2e-05, "loss": 0.6639, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10760, "tokens_per_second_per_gpu": 10016.38, "total_tokens": 1062394207 }, { "epoch": 0.6727306826706677, "grad_norm": 0.9010728001594543, "learning_rate": 2e-05, "loss": 0.6172, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10761, "tokens_per_second_per_gpu": 10372.72, "total_tokens": 1062491461 }, { "epoch": 0.6727931982995748, "grad_norm": 0.9188465476036072, "learning_rate": 2e-05, "loss": 0.6052, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10762, "tokens_per_second_per_gpu": 10315.08, "total_tokens": 1062589100 }, { "epoch": 0.6728557139284821, "grad_norm": 0.9077430367469788, "learning_rate": 2e-05, "loss": 0.6526, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10763, "tokens_per_second_per_gpu": 10106.66, "total_tokens": 1062687158 }, { "epoch": 0.6729182295573893, "grad_norm": 0.8933963775634766, "learning_rate": 2e-05, "loss": 0.6315, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10764, "tokens_per_second_per_gpu": 10844.43, "total_tokens": 1062788051 }, { "epoch": 0.6729807451862966, "grad_norm": 0.8881203532218933, "learning_rate": 2e-05, "loss": 0.6493, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10765, "tokens_per_second_per_gpu": 10532.15, "total_tokens": 1062888436 }, { "epoch": 0.6730432608152038, "grad_norm": 0.9175270199775696, "learning_rate": 2e-05, "loss": 0.6839, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10766, "tokens_per_second_per_gpu": 10511.81, "total_tokens": 1062990903 }, { "epoch": 0.6731057764441111, "grad_norm": 0.9186117053031921, "learning_rate": 2e-05, "loss": 0.6542, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10767, "tokens_per_second_per_gpu": 10014.79, "total_tokens": 1063086404 }, { "epoch": 0.6731682920730182, "grad_norm": 0.859076976776123, "learning_rate": 2e-05, "loss": 0.6109, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10768, "tokens_per_second_per_gpu": 10677.12, "total_tokens": 1063184626 }, { "epoch": 0.6732308077019254, "grad_norm": 0.9042644500732422, "learning_rate": 2e-05, "loss": 0.6258, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10769, "tokens_per_second_per_gpu": 10528.62, "total_tokens": 1063281201 }, { "epoch": 0.6732933233308327, "grad_norm": 0.8755762577056885, "learning_rate": 2e-05, "loss": 0.61, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10770, "tokens_per_second_per_gpu": 10500.9, "total_tokens": 1063379842 }, { "epoch": 0.6733558389597399, "grad_norm": 0.8697069883346558, "learning_rate": 2e-05, "loss": 0.6292, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10771, "tokens_per_second_per_gpu": 11379.38, "total_tokens": 1063480548 }, { "epoch": 0.6734183545886472, "grad_norm": 0.88263338804245, "learning_rate": 2e-05, "loss": 0.6813, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10772, "tokens_per_second_per_gpu": 11537.69, "total_tokens": 1063586634 }, { "epoch": 0.6734808702175544, "grad_norm": 0.926163375377655, "learning_rate": 2e-05, "loss": 0.6349, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10773, "tokens_per_second_per_gpu": 10039.15, "total_tokens": 1063680877 }, { "epoch": 0.6735433858464616, "grad_norm": 0.9041322469711304, "learning_rate": 2e-05, "loss": 0.6123, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10774, "tokens_per_second_per_gpu": 10264.04, "total_tokens": 1063780733 }, { "epoch": 0.6736059014753688, "grad_norm": 0.9544082880020142, "learning_rate": 2e-05, "loss": 0.6033, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10775, "tokens_per_second_per_gpu": 10056.66, "total_tokens": 1063876280 }, { "epoch": 0.673668417104276, "grad_norm": 0.864457905292511, "learning_rate": 2e-05, "loss": 0.6212, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10776, "tokens_per_second_per_gpu": 10585.3, "total_tokens": 1063978182 }, { "epoch": 0.6737309327331833, "grad_norm": 0.9453629851341248, "learning_rate": 2e-05, "loss": 0.5851, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10777, "tokens_per_second_per_gpu": 10983.78, "total_tokens": 1064073667 }, { "epoch": 0.6737934483620905, "grad_norm": 0.900973379611969, "learning_rate": 2e-05, "loss": 0.6836, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10778, "tokens_per_second_per_gpu": 10192.73, "total_tokens": 1064170549 }, { "epoch": 0.6738559639909978, "grad_norm": 0.876032292842865, "learning_rate": 2e-05, "loss": 0.615, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10779, "tokens_per_second_per_gpu": 11318.51, "total_tokens": 1064275024 }, { "epoch": 0.673918479619905, "grad_norm": 0.9204630851745605, "learning_rate": 2e-05, "loss": 0.6599, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10780, "tokens_per_second_per_gpu": 10885.34, "total_tokens": 1064375353 }, { "epoch": 0.6739809952488122, "grad_norm": 0.8909255862236023, "learning_rate": 2e-05, "loss": 0.6263, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10781, "tokens_per_second_per_gpu": 9586.49, "total_tokens": 1064469284 }, { "epoch": 0.6740435108777194, "grad_norm": 0.909468948841095, "learning_rate": 2e-05, "loss": 0.6171, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10782, "tokens_per_second_per_gpu": 9668.44, "total_tokens": 1064565268 }, { "epoch": 0.6741060265066267, "grad_norm": 0.8706201910972595, "learning_rate": 2e-05, "loss": 0.6186, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10783, "tokens_per_second_per_gpu": 10578.97, "total_tokens": 1064659207 }, { "epoch": 0.6741685421355339, "grad_norm": 0.8818085789680481, "learning_rate": 2e-05, "loss": 0.6356, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10784, "tokens_per_second_per_gpu": 11364.47, "total_tokens": 1064762272 }, { "epoch": 0.6742310577644411, "grad_norm": 0.8799282908439636, "learning_rate": 2e-05, "loss": 0.6099, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10785, "tokens_per_second_per_gpu": 10791.27, "total_tokens": 1064861220 }, { "epoch": 0.6742935733933484, "grad_norm": 0.866132915019989, "learning_rate": 2e-05, "loss": 0.5938, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10786, "tokens_per_second_per_gpu": 10671.67, "total_tokens": 1064963337 }, { "epoch": 0.6743560890222555, "grad_norm": 0.9040821194648743, "learning_rate": 2e-05, "loss": 0.6263, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10787, "tokens_per_second_per_gpu": 10613.87, "total_tokens": 1065062616 }, { "epoch": 0.6744186046511628, "grad_norm": 0.8851277828216553, "learning_rate": 2e-05, "loss": 0.6053, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10788, "tokens_per_second_per_gpu": 9835.6, "total_tokens": 1065159962 }, { "epoch": 0.67448112028007, "grad_norm": 0.8727631568908691, "learning_rate": 2e-05, "loss": 0.6157, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10789, "tokens_per_second_per_gpu": 11319.06, "total_tokens": 1065260514 }, { "epoch": 0.6745436359089773, "grad_norm": 0.8771693706512451, "learning_rate": 2e-05, "loss": 0.5953, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10790, "tokens_per_second_per_gpu": 11099.92, "total_tokens": 1065357920 }, { "epoch": 0.6746061515378845, "grad_norm": 0.8580240607261658, "learning_rate": 2e-05, "loss": 0.6403, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10791, "tokens_per_second_per_gpu": 10976.23, "total_tokens": 1065463419 }, { "epoch": 0.6746686671667917, "grad_norm": 0.9083096385002136, "learning_rate": 2e-05, "loss": 0.6532, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10792, "tokens_per_second_per_gpu": 11019.22, "total_tokens": 1065563537 }, { "epoch": 0.6747311827956989, "grad_norm": 0.8858464360237122, "learning_rate": 2e-05, "loss": 0.6091, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10793, "tokens_per_second_per_gpu": 10499.64, "total_tokens": 1065661733 }, { "epoch": 0.6747936984246061, "grad_norm": 0.8965510725975037, "learning_rate": 2e-05, "loss": 0.6102, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10794, "tokens_per_second_per_gpu": 9965.14, "total_tokens": 1065756998 }, { "epoch": 0.6748562140535134, "grad_norm": 0.8690890669822693, "learning_rate": 2e-05, "loss": 0.645, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10795, "tokens_per_second_per_gpu": 11154.13, "total_tokens": 1065856587 }, { "epoch": 0.6749187296824206, "grad_norm": 0.890217125415802, "learning_rate": 2e-05, "loss": 0.6217, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10796, "tokens_per_second_per_gpu": 10774.18, "total_tokens": 1065952128 }, { "epoch": 0.6749812453113279, "grad_norm": 0.8798078894615173, "learning_rate": 2e-05, "loss": 0.6214, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10797, "tokens_per_second_per_gpu": 11313.54, "total_tokens": 1066052082 }, { "epoch": 0.6750437609402351, "grad_norm": 0.8703601956367493, "learning_rate": 2e-05, "loss": 0.6049, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10798, "tokens_per_second_per_gpu": 10369.71, "total_tokens": 1066151650 }, { "epoch": 0.6751062765691422, "grad_norm": 0.8862954378128052, "learning_rate": 2e-05, "loss": 0.6018, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10799, "tokens_per_second_per_gpu": 10470.5, "total_tokens": 1066249588 }, { "epoch": 0.6751687921980495, "grad_norm": 0.8751822710037231, "learning_rate": 2e-05, "loss": 0.6254, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10800, "tokens_per_second_per_gpu": 10967.91, "total_tokens": 1066350239 }, { "epoch": 0.6752313078269567, "grad_norm": 0.8584031462669373, "learning_rate": 2e-05, "loss": 0.5829, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10801, "tokens_per_second_per_gpu": 10490.54, "total_tokens": 1066445931 }, { "epoch": 0.675293823455864, "grad_norm": 0.9008505344390869, "learning_rate": 2e-05, "loss": 0.5951, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10802, "tokens_per_second_per_gpu": 10412.89, "total_tokens": 1066544816 }, { "epoch": 0.6753563390847712, "grad_norm": 0.8549104332923889, "learning_rate": 2e-05, "loss": 0.6072, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10803, "tokens_per_second_per_gpu": 10716.25, "total_tokens": 1066644446 }, { "epoch": 0.6754188547136785, "grad_norm": 0.9097585678100586, "learning_rate": 2e-05, "loss": 0.6206, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10804, "tokens_per_second_per_gpu": 10320.23, "total_tokens": 1066743959 }, { "epoch": 0.6754813703425856, "grad_norm": 0.8840789794921875, "learning_rate": 2e-05, "loss": 0.6172, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10805, "tokens_per_second_per_gpu": 10272.83, "total_tokens": 1066842576 }, { "epoch": 0.6755438859714928, "grad_norm": 0.8585575222969055, "learning_rate": 2e-05, "loss": 0.5939, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10806, "tokens_per_second_per_gpu": 9971.25, "total_tokens": 1066936764 }, { "epoch": 0.6756064016004001, "grad_norm": 0.8822081685066223, "learning_rate": 2e-05, "loss": 0.6543, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10807, "tokens_per_second_per_gpu": 11207.41, "total_tokens": 1067037686 }, { "epoch": 0.6756689172293073, "grad_norm": 0.8681420683860779, "learning_rate": 2e-05, "loss": 0.5609, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10808, "tokens_per_second_per_gpu": 9884.95, "total_tokens": 1067132219 }, { "epoch": 0.6757314328582146, "grad_norm": 0.8696290850639343, "learning_rate": 2e-05, "loss": 0.6229, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10809, "tokens_per_second_per_gpu": 10705.71, "total_tokens": 1067230198 }, { "epoch": 0.6757939484871218, "grad_norm": 0.936859667301178, "learning_rate": 2e-05, "loss": 0.6072, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10810, "tokens_per_second_per_gpu": 10296.21, "total_tokens": 1067330002 }, { "epoch": 0.675856464116029, "grad_norm": 0.8780487775802612, "learning_rate": 2e-05, "loss": 0.6577, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10811, "tokens_per_second_per_gpu": 10959.41, "total_tokens": 1067429504 }, { "epoch": 0.6759189797449362, "grad_norm": 0.883612871170044, "learning_rate": 2e-05, "loss": 0.6485, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10812, "tokens_per_second_per_gpu": 10603.59, "total_tokens": 1067529307 }, { "epoch": 0.6759814953738434, "grad_norm": 0.8757772445678711, "learning_rate": 2e-05, "loss": 0.6113, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10813, "tokens_per_second_per_gpu": 10795.36, "total_tokens": 1067627666 }, { "epoch": 0.6760440110027507, "grad_norm": 0.9057273268699646, "learning_rate": 2e-05, "loss": 0.6313, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10814, "tokens_per_second_per_gpu": 10049.52, "total_tokens": 1067727261 }, { "epoch": 0.6761065266316579, "grad_norm": 0.8933088183403015, "learning_rate": 2e-05, "loss": 0.619, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10815, "tokens_per_second_per_gpu": 10120.54, "total_tokens": 1067826123 }, { "epoch": 0.6761690422605652, "grad_norm": 0.8913139700889587, "learning_rate": 2e-05, "loss": 0.6853, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10816, "tokens_per_second_per_gpu": 10518.79, "total_tokens": 1067926316 }, { "epoch": 0.6762315578894724, "grad_norm": 0.8889584541320801, "learning_rate": 2e-05, "loss": 0.6396, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10817, "tokens_per_second_per_gpu": 9936.2, "total_tokens": 1068023398 }, { "epoch": 0.6762940735183796, "grad_norm": 0.8829503655433655, "learning_rate": 2e-05, "loss": 0.6136, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10818, "tokens_per_second_per_gpu": 11143.19, "total_tokens": 1068122748 }, { "epoch": 0.6763565891472868, "grad_norm": 0.8768774271011353, "learning_rate": 2e-05, "loss": 0.6392, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10819, "tokens_per_second_per_gpu": 11082.22, "total_tokens": 1068224856 }, { "epoch": 0.676419104776194, "grad_norm": 0.888154149055481, "learning_rate": 2e-05, "loss": 0.6276, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10820, "tokens_per_second_per_gpu": 10617.94, "total_tokens": 1068323988 }, { "epoch": 0.6764816204051013, "grad_norm": 0.9170620441436768, "learning_rate": 2e-05, "loss": 0.6182, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10821, "tokens_per_second_per_gpu": 10670.57, "total_tokens": 1068422267 }, { "epoch": 0.6765441360340085, "grad_norm": 0.8567107915878296, "learning_rate": 2e-05, "loss": 0.5728, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10822, "tokens_per_second_per_gpu": 9815.59, "total_tokens": 1068516367 }, { "epoch": 0.6766066516629158, "grad_norm": 0.86753249168396, "learning_rate": 2e-05, "loss": 0.6077, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10823, "tokens_per_second_per_gpu": 10827.88, "total_tokens": 1068619901 }, { "epoch": 0.6766691672918229, "grad_norm": 0.8905871510505676, "learning_rate": 2e-05, "loss": 0.6472, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10824, "tokens_per_second_per_gpu": 10597.85, "total_tokens": 1068720050 }, { "epoch": 0.6767316829207302, "grad_norm": 0.9176638722419739, "learning_rate": 2e-05, "loss": 0.6197, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10825, "tokens_per_second_per_gpu": 10228.75, "total_tokens": 1068816307 }, { "epoch": 0.6767941985496374, "grad_norm": 0.9188492298126221, "learning_rate": 2e-05, "loss": 0.6243, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10826, "tokens_per_second_per_gpu": 11276.38, "total_tokens": 1068917014 }, { "epoch": 0.6768567141785446, "grad_norm": 0.8729111552238464, "learning_rate": 2e-05, "loss": 0.6172, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10827, "tokens_per_second_per_gpu": 11389.31, "total_tokens": 1069021544 }, { "epoch": 0.6769192298074519, "grad_norm": 0.9260186553001404, "learning_rate": 2e-05, "loss": 0.6378, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10828, "tokens_per_second_per_gpu": 10757.25, "total_tokens": 1069123722 }, { "epoch": 0.6769817454363591, "grad_norm": 0.8856901526451111, "learning_rate": 2e-05, "loss": 0.6524, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10829, "tokens_per_second_per_gpu": 11217.97, "total_tokens": 1069224366 }, { "epoch": 0.6770442610652663, "grad_norm": 0.867699921131134, "learning_rate": 2e-05, "loss": 0.6573, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10830, "tokens_per_second_per_gpu": 11326.17, "total_tokens": 1069330672 }, { "epoch": 0.6771067766941735, "grad_norm": 0.9219883680343628, "learning_rate": 2e-05, "loss": 0.6707, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10831, "tokens_per_second_per_gpu": 10223.05, "total_tokens": 1069430542 }, { "epoch": 0.6771692923230808, "grad_norm": 0.8847343921661377, "learning_rate": 2e-05, "loss": 0.6276, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10832, "tokens_per_second_per_gpu": 9850.63, "total_tokens": 1069529338 }, { "epoch": 0.677231807951988, "grad_norm": 0.8680580258369446, "learning_rate": 2e-05, "loss": 0.6024, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10833, "tokens_per_second_per_gpu": 10637.27, "total_tokens": 1069628473 }, { "epoch": 0.6772943235808953, "grad_norm": 0.9382153153419495, "learning_rate": 2e-05, "loss": 0.6337, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10834, "tokens_per_second_per_gpu": 9949.2, "total_tokens": 1069723885 }, { "epoch": 0.6773568392098025, "grad_norm": 0.9031991362571716, "learning_rate": 2e-05, "loss": 0.6766, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10835, "tokens_per_second_per_gpu": 10409.86, "total_tokens": 1069821082 }, { "epoch": 0.6774193548387096, "grad_norm": 0.8842048048973083, "learning_rate": 2e-05, "loss": 0.6053, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10836, "tokens_per_second_per_gpu": 10053.63, "total_tokens": 1069918948 }, { "epoch": 0.6774818704676169, "grad_norm": 0.8713580369949341, "learning_rate": 2e-05, "loss": 0.6263, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10837, "tokens_per_second_per_gpu": 10902.76, "total_tokens": 1070021584 }, { "epoch": 0.6775443860965241, "grad_norm": 0.9243414998054504, "learning_rate": 2e-05, "loss": 0.6324, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10838, "tokens_per_second_per_gpu": 9851.37, "total_tokens": 1070120441 }, { "epoch": 0.6776069017254314, "grad_norm": 0.9147190451622009, "learning_rate": 2e-05, "loss": 0.696, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10839, "tokens_per_second_per_gpu": 11000.02, "total_tokens": 1070223809 }, { "epoch": 0.6776694173543386, "grad_norm": 0.870880126953125, "learning_rate": 2e-05, "loss": 0.6388, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10840, "tokens_per_second_per_gpu": 10977.21, "total_tokens": 1070325339 }, { "epoch": 0.6777319329832459, "grad_norm": 0.8697442412376404, "learning_rate": 2e-05, "loss": 0.6095, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10841, "tokens_per_second_per_gpu": 11030.01, "total_tokens": 1070426222 }, { "epoch": 0.677794448612153, "grad_norm": 0.850505530834198, "learning_rate": 2e-05, "loss": 0.6048, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10842, "tokens_per_second_per_gpu": 11234.65, "total_tokens": 1070525146 }, { "epoch": 0.6778569642410602, "grad_norm": 0.8739424347877502, "learning_rate": 2e-05, "loss": 0.6306, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10843, "tokens_per_second_per_gpu": 10292.17, "total_tokens": 1070623780 }, { "epoch": 0.6779194798699675, "grad_norm": 0.8999555706977844, "learning_rate": 2e-05, "loss": 0.6398, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10844, "tokens_per_second_per_gpu": 10818.63, "total_tokens": 1070725936 }, { "epoch": 0.6779819954988747, "grad_norm": 0.8527818918228149, "learning_rate": 2e-05, "loss": 0.5931, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10845, "tokens_per_second_per_gpu": 10634.03, "total_tokens": 1070825198 }, { "epoch": 0.678044511127782, "grad_norm": 0.9003362059593201, "learning_rate": 2e-05, "loss": 0.6422, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10846, "tokens_per_second_per_gpu": 10717.89, "total_tokens": 1070926402 }, { "epoch": 0.6781070267566892, "grad_norm": 0.8713018894195557, "learning_rate": 2e-05, "loss": 0.6319, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10847, "tokens_per_second_per_gpu": 10679.36, "total_tokens": 1071029880 }, { "epoch": 0.6781695423855963, "grad_norm": 0.9070158004760742, "learning_rate": 2e-05, "loss": 0.5954, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10848, "tokens_per_second_per_gpu": 10568.68, "total_tokens": 1071124560 }, { "epoch": 0.6782320580145036, "grad_norm": 0.889148473739624, "learning_rate": 2e-05, "loss": 0.6189, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10849, "tokens_per_second_per_gpu": 10093.65, "total_tokens": 1071222642 }, { "epoch": 0.6782945736434108, "grad_norm": 0.8751437664031982, "learning_rate": 2e-05, "loss": 0.5856, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10850, "tokens_per_second_per_gpu": 10135.37, "total_tokens": 1071318736 }, { "epoch": 0.6783570892723181, "grad_norm": 0.898919939994812, "learning_rate": 2e-05, "loss": 0.6328, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10851, "tokens_per_second_per_gpu": 10346.96, "total_tokens": 1071418638 }, { "epoch": 0.6784196049012253, "grad_norm": 0.9000781178474426, "learning_rate": 2e-05, "loss": 0.6609, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10852, "tokens_per_second_per_gpu": 10662.66, "total_tokens": 1071515228 }, { "epoch": 0.6784821205301326, "grad_norm": 0.941264271736145, "learning_rate": 2e-05, "loss": 0.5687, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10853, "tokens_per_second_per_gpu": 10768.15, "total_tokens": 1071611351 }, { "epoch": 0.6785446361590397, "grad_norm": 0.9104703664779663, "learning_rate": 2e-05, "loss": 0.6136, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10854, "tokens_per_second_per_gpu": 10853.65, "total_tokens": 1071710771 }, { "epoch": 0.678607151787947, "grad_norm": 0.9578956961631775, "learning_rate": 2e-05, "loss": 0.6257, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10855, "tokens_per_second_per_gpu": 10472.17, "total_tokens": 1071810034 }, { "epoch": 0.6786696674168542, "grad_norm": 0.9210514426231384, "learning_rate": 2e-05, "loss": 0.6502, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10856, "tokens_per_second_per_gpu": 11467.26, "total_tokens": 1071912339 }, { "epoch": 0.6787321830457614, "grad_norm": 0.8913783431053162, "learning_rate": 2e-05, "loss": 0.6296, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10857, "tokens_per_second_per_gpu": 10352.64, "total_tokens": 1072012633 }, { "epoch": 0.6787946986746687, "grad_norm": 0.900651216506958, "learning_rate": 2e-05, "loss": 0.5865, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10858, "tokens_per_second_per_gpu": 10888.4, "total_tokens": 1072109377 }, { "epoch": 0.6788572143035759, "grad_norm": 0.8798837661743164, "learning_rate": 2e-05, "loss": 0.6645, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10859, "tokens_per_second_per_gpu": 11180.92, "total_tokens": 1072210881 }, { "epoch": 0.6789197299324832, "grad_norm": 1.0105279684066772, "learning_rate": 2e-05, "loss": 0.6328, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10860, "tokens_per_second_per_gpu": 10202.38, "total_tokens": 1072307780 }, { "epoch": 0.6789822455613903, "grad_norm": 0.8584458827972412, "learning_rate": 2e-05, "loss": 0.6117, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10861, "tokens_per_second_per_gpu": 10180.69, "total_tokens": 1072411214 }, { "epoch": 0.6790447611902976, "grad_norm": 0.8494519591331482, "learning_rate": 2e-05, "loss": 0.613, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10862, "tokens_per_second_per_gpu": 11019.49, "total_tokens": 1072513846 }, { "epoch": 0.6791072768192048, "grad_norm": 0.8774095773696899, "learning_rate": 2e-05, "loss": 0.6379, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10863, "tokens_per_second_per_gpu": 10544.36, "total_tokens": 1072612982 }, { "epoch": 0.679169792448112, "grad_norm": 0.9482675790786743, "learning_rate": 2e-05, "loss": 0.5886, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10864, "tokens_per_second_per_gpu": 10420.33, "total_tokens": 1072709858 }, { "epoch": 0.6792323080770193, "grad_norm": 0.9226873517036438, "learning_rate": 2e-05, "loss": 0.6356, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10865, "tokens_per_second_per_gpu": 9905.01, "total_tokens": 1072806129 }, { "epoch": 0.6792948237059265, "grad_norm": 0.878192126750946, "learning_rate": 2e-05, "loss": 0.6033, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10866, "tokens_per_second_per_gpu": 10256.52, "total_tokens": 1072902496 }, { "epoch": 0.6793573393348337, "grad_norm": 0.8890897631645203, "learning_rate": 2e-05, "loss": 0.6525, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10867, "tokens_per_second_per_gpu": 10590.69, "total_tokens": 1073004010 }, { "epoch": 0.6794198549637409, "grad_norm": 0.9061208963394165, "learning_rate": 2e-05, "loss": 0.6407, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10868, "tokens_per_second_per_gpu": 10606.81, "total_tokens": 1073105775 }, { "epoch": 0.6794823705926482, "grad_norm": 0.8743976354598999, "learning_rate": 2e-05, "loss": 0.647, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10869, "tokens_per_second_per_gpu": 11773.21, "total_tokens": 1073209369 }, { "epoch": 0.6795448862215554, "grad_norm": 0.8957871794700623, "learning_rate": 2e-05, "loss": 0.6196, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10870, "tokens_per_second_per_gpu": 11142.91, "total_tokens": 1073307177 }, { "epoch": 0.6796074018504626, "grad_norm": 0.8933908939361572, "learning_rate": 2e-05, "loss": 0.6072, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10871, "tokens_per_second_per_gpu": 10340.22, "total_tokens": 1073405421 }, { "epoch": 0.6796699174793699, "grad_norm": 0.891322135925293, "learning_rate": 2e-05, "loss": 0.6332, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10872, "tokens_per_second_per_gpu": 10724.12, "total_tokens": 1073507607 }, { "epoch": 0.679732433108277, "grad_norm": 0.9837465286254883, "learning_rate": 2e-05, "loss": 0.609, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10873, "tokens_per_second_per_gpu": 10561.51, "total_tokens": 1073605207 }, { "epoch": 0.6797949487371843, "grad_norm": 0.8729015588760376, "learning_rate": 2e-05, "loss": 0.6221, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10874, "tokens_per_second_per_gpu": 10999.98, "total_tokens": 1073705606 }, { "epoch": 0.6798574643660915, "grad_norm": 0.8831802010536194, "learning_rate": 2e-05, "loss": 0.6322, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10875, "tokens_per_second_per_gpu": 11115.78, "total_tokens": 1073805383 }, { "epoch": 0.6799199799949988, "grad_norm": 0.8682684898376465, "learning_rate": 2e-05, "loss": 0.6255, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10876, "tokens_per_second_per_gpu": 10677.67, "total_tokens": 1073906360 }, { "epoch": 0.679982495623906, "grad_norm": 0.9061126112937927, "learning_rate": 2e-05, "loss": 0.6391, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10877, "tokens_per_second_per_gpu": 9985.43, "total_tokens": 1073999631 }, { "epoch": 0.6800450112528132, "grad_norm": 0.9205923676490784, "learning_rate": 2e-05, "loss": 0.6225, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10878, "tokens_per_second_per_gpu": 9517.04, "total_tokens": 1074098140 }, { "epoch": 0.6801075268817204, "grad_norm": 1.1163315773010254, "learning_rate": 2e-05, "loss": 0.6245, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10879, "tokens_per_second_per_gpu": 10391.13, "total_tokens": 1074195099 }, { "epoch": 0.6801700425106276, "grad_norm": 0.864381730556488, "learning_rate": 2e-05, "loss": 0.6085, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10880, "tokens_per_second_per_gpu": 10647.22, "total_tokens": 1074294330 }, { "epoch": 0.6802325581395349, "grad_norm": 0.9598019123077393, "learning_rate": 2e-05, "loss": 0.6935, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10881, "tokens_per_second_per_gpu": 10272.66, "total_tokens": 1074392882 }, { "epoch": 0.6802950737684421, "grad_norm": 0.9452440142631531, "learning_rate": 2e-05, "loss": 0.6581, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10882, "tokens_per_second_per_gpu": 10185.77, "total_tokens": 1074490366 }, { "epoch": 0.6803575893973494, "grad_norm": 0.9347766637802124, "learning_rate": 2e-05, "loss": 0.6236, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10883, "tokens_per_second_per_gpu": 11080.69, "total_tokens": 1074591504 }, { "epoch": 0.6804201050262566, "grad_norm": 0.9046878814697266, "learning_rate": 2e-05, "loss": 0.6321, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10884, "tokens_per_second_per_gpu": 10331.08, "total_tokens": 1074689187 }, { "epoch": 0.6804826206551637, "grad_norm": 0.9023765921592712, "learning_rate": 2e-05, "loss": 0.6608, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10885, "tokens_per_second_per_gpu": 10903.79, "total_tokens": 1074788093 }, { "epoch": 0.680545136284071, "grad_norm": 0.8709825277328491, "learning_rate": 2e-05, "loss": 0.6027, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10886, "tokens_per_second_per_gpu": 10553.81, "total_tokens": 1074886070 }, { "epoch": 0.6806076519129782, "grad_norm": 0.8889158368110657, "learning_rate": 2e-05, "loss": 0.6287, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10887, "tokens_per_second_per_gpu": 11264.1, "total_tokens": 1074987327 }, { "epoch": 0.6806701675418855, "grad_norm": 0.9049647450447083, "learning_rate": 2e-05, "loss": 0.6061, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10888, "tokens_per_second_per_gpu": 10220.79, "total_tokens": 1075085335 }, { "epoch": 0.6807326831707927, "grad_norm": 0.8793290257453918, "learning_rate": 2e-05, "loss": 0.5939, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10889, "tokens_per_second_per_gpu": 9957.18, "total_tokens": 1075182692 }, { "epoch": 0.6807951987997, "grad_norm": 0.9194596409797668, "learning_rate": 2e-05, "loss": 0.5812, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10890, "tokens_per_second_per_gpu": 9849.21, "total_tokens": 1075276937 }, { "epoch": 0.6808577144286071, "grad_norm": 0.927786648273468, "learning_rate": 2e-05, "loss": 0.6203, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10891, "tokens_per_second_per_gpu": 10362.99, "total_tokens": 1075374117 }, { "epoch": 0.6809202300575143, "grad_norm": 0.9229403734207153, "learning_rate": 2e-05, "loss": 0.6791, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10892, "tokens_per_second_per_gpu": 10965.81, "total_tokens": 1075473358 }, { "epoch": 0.6809827456864216, "grad_norm": 0.9017870426177979, "learning_rate": 2e-05, "loss": 0.6567, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10893, "tokens_per_second_per_gpu": 11475.06, "total_tokens": 1075577712 }, { "epoch": 0.6810452613153288, "grad_norm": 0.8959168791770935, "learning_rate": 2e-05, "loss": 0.6599, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10894, "tokens_per_second_per_gpu": 10938.42, "total_tokens": 1075676767 }, { "epoch": 0.6811077769442361, "grad_norm": 0.9402148127555847, "learning_rate": 2e-05, "loss": 0.6112, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10895, "tokens_per_second_per_gpu": 10887.56, "total_tokens": 1075772475 }, { "epoch": 0.6811702925731433, "grad_norm": 0.8860181570053101, "learning_rate": 2e-05, "loss": 0.6061, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10896, "tokens_per_second_per_gpu": 11104.63, "total_tokens": 1075871614 }, { "epoch": 0.6812328082020506, "grad_norm": 0.8834755420684814, "learning_rate": 2e-05, "loss": 0.5927, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10897, "tokens_per_second_per_gpu": 10421.33, "total_tokens": 1075971119 }, { "epoch": 0.6812953238309577, "grad_norm": 0.9008244276046753, "learning_rate": 2e-05, "loss": 0.596, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10898, "tokens_per_second_per_gpu": 10031.61, "total_tokens": 1076066364 }, { "epoch": 0.681357839459865, "grad_norm": 0.9492859244346619, "learning_rate": 2e-05, "loss": 0.6341, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10899, "tokens_per_second_per_gpu": 11016.25, "total_tokens": 1076169340 }, { "epoch": 0.6814203550887722, "grad_norm": 0.8990079164505005, "learning_rate": 2e-05, "loss": 0.6312, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10900, "tokens_per_second_per_gpu": 10925.6, "total_tokens": 1076274897 }, { "epoch": 0.6814828707176794, "grad_norm": 0.9144136905670166, "learning_rate": 2e-05, "loss": 0.6223, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10901, "tokens_per_second_per_gpu": 10101.05, "total_tokens": 1076374565 }, { "epoch": 0.6815453863465867, "grad_norm": 0.975801408290863, "learning_rate": 2e-05, "loss": 0.6275, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10902, "tokens_per_second_per_gpu": 9954.94, "total_tokens": 1076470786 }, { "epoch": 0.6816079019754939, "grad_norm": 0.8980146050453186, "learning_rate": 2e-05, "loss": 0.6216, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10903, "tokens_per_second_per_gpu": 11429.75, "total_tokens": 1076571076 }, { "epoch": 0.6816704176044011, "grad_norm": 0.8855369687080383, "learning_rate": 2e-05, "loss": 0.6154, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10904, "tokens_per_second_per_gpu": 10637.35, "total_tokens": 1076671548 }, { "epoch": 0.6817329332333083, "grad_norm": 0.8617079257965088, "learning_rate": 2e-05, "loss": 0.6109, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10905, "tokens_per_second_per_gpu": 11103.95, "total_tokens": 1076775184 }, { "epoch": 0.6817954488622155, "grad_norm": 0.9297115206718445, "learning_rate": 2e-05, "loss": 0.6802, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10906, "tokens_per_second_per_gpu": 10519.16, "total_tokens": 1076872624 }, { "epoch": 0.6818579644911228, "grad_norm": 0.936443567276001, "learning_rate": 2e-05, "loss": 0.5555, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10907, "tokens_per_second_per_gpu": 9605.69, "total_tokens": 1076968493 }, { "epoch": 0.68192048012003, "grad_norm": 0.940557062625885, "learning_rate": 2e-05, "loss": 0.6559, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10908, "tokens_per_second_per_gpu": 10537.28, "total_tokens": 1077064805 }, { "epoch": 0.6819829957489373, "grad_norm": 0.8450436592102051, "learning_rate": 2e-05, "loss": 0.6296, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10909, "tokens_per_second_per_gpu": 10300.29, "total_tokens": 1077165095 }, { "epoch": 0.6820455113778444, "grad_norm": 0.89073246717453, "learning_rate": 2e-05, "loss": 0.6141, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10910, "tokens_per_second_per_gpu": 10707.67, "total_tokens": 1077263471 }, { "epoch": 0.6821080270067517, "grad_norm": 0.9475083351135254, "learning_rate": 2e-05, "loss": 0.6187, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10911, "tokens_per_second_per_gpu": 10572.57, "total_tokens": 1077363531 }, { "epoch": 0.6821705426356589, "grad_norm": 0.9187122583389282, "learning_rate": 2e-05, "loss": 0.601, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10912, "tokens_per_second_per_gpu": 9593.83, "total_tokens": 1077458779 }, { "epoch": 0.6822330582645662, "grad_norm": 0.909180223941803, "learning_rate": 2e-05, "loss": 0.6403, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10913, "tokens_per_second_per_gpu": 10538.31, "total_tokens": 1077559099 }, { "epoch": 0.6822955738934734, "grad_norm": 0.8808305263519287, "learning_rate": 2e-05, "loss": 0.6107, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10914, "tokens_per_second_per_gpu": 10406.1, "total_tokens": 1077659975 }, { "epoch": 0.6823580895223806, "grad_norm": 0.8689979910850525, "learning_rate": 2e-05, "loss": 0.5908, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10915, "tokens_per_second_per_gpu": 10288.69, "total_tokens": 1077756897 }, { "epoch": 0.6824206051512878, "grad_norm": 0.946880578994751, "learning_rate": 2e-05, "loss": 0.6508, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10916, "tokens_per_second_per_gpu": 11145.4, "total_tokens": 1077860257 }, { "epoch": 0.682483120780195, "grad_norm": 0.882310688495636, "learning_rate": 2e-05, "loss": 0.61, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10917, "tokens_per_second_per_gpu": 11038.25, "total_tokens": 1077959430 }, { "epoch": 0.6825456364091023, "grad_norm": 0.8913652896881104, "learning_rate": 2e-05, "loss": 0.6112, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10918, "tokens_per_second_per_gpu": 10541.03, "total_tokens": 1078058439 }, { "epoch": 0.6826081520380095, "grad_norm": 0.8883427381515503, "learning_rate": 2e-05, "loss": 0.6001, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10919, "tokens_per_second_per_gpu": 10300.52, "total_tokens": 1078153413 }, { "epoch": 0.6826706676669168, "grad_norm": 0.939187228679657, "learning_rate": 2e-05, "loss": 0.6292, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10920, "tokens_per_second_per_gpu": 10517.98, "total_tokens": 1078252579 }, { "epoch": 0.682733183295824, "grad_norm": 0.8971418142318726, "learning_rate": 2e-05, "loss": 0.6173, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10921, "tokens_per_second_per_gpu": 10561.12, "total_tokens": 1078350062 }, { "epoch": 0.6827956989247311, "grad_norm": 0.8620502948760986, "learning_rate": 2e-05, "loss": 0.5961, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10922, "tokens_per_second_per_gpu": 10861.61, "total_tokens": 1078450539 }, { "epoch": 0.6828582145536384, "grad_norm": 0.896229088306427, "learning_rate": 2e-05, "loss": 0.6489, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10923, "tokens_per_second_per_gpu": 10526.36, "total_tokens": 1078552364 }, { "epoch": 0.6829207301825456, "grad_norm": 0.9101691246032715, "learning_rate": 2e-05, "loss": 0.6517, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10924, "tokens_per_second_per_gpu": 10775.06, "total_tokens": 1078656634 }, { "epoch": 0.6829832458114529, "grad_norm": 0.9146303534507751, "learning_rate": 2e-05, "loss": 0.5877, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10925, "tokens_per_second_per_gpu": 10247.25, "total_tokens": 1078755791 }, { "epoch": 0.6830457614403601, "grad_norm": 0.8422977924346924, "learning_rate": 2e-05, "loss": 0.6222, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10926, "tokens_per_second_per_gpu": 10891.53, "total_tokens": 1078857464 }, { "epoch": 0.6831082770692674, "grad_norm": 0.8839962482452393, "learning_rate": 2e-05, "loss": 0.6375, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10927, "tokens_per_second_per_gpu": 10877.85, "total_tokens": 1078959348 }, { "epoch": 0.6831707926981745, "grad_norm": 0.8838193416595459, "learning_rate": 2e-05, "loss": 0.6197, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10928, "tokens_per_second_per_gpu": 10732.03, "total_tokens": 1079057927 }, { "epoch": 0.6832333083270817, "grad_norm": 0.8979529142379761, "learning_rate": 2e-05, "loss": 0.6426, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10929, "tokens_per_second_per_gpu": 10535.67, "total_tokens": 1079159304 }, { "epoch": 0.683295823955989, "grad_norm": 0.8935354351997375, "learning_rate": 2e-05, "loss": 0.6519, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10930, "tokens_per_second_per_gpu": 10702.35, "total_tokens": 1079260829 }, { "epoch": 0.6833583395848962, "grad_norm": 0.8883510828018188, "learning_rate": 2e-05, "loss": 0.633, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10931, "tokens_per_second_per_gpu": 10850.62, "total_tokens": 1079361629 }, { "epoch": 0.6834208552138035, "grad_norm": 0.9159126877784729, "learning_rate": 2e-05, "loss": 0.6601, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10932, "tokens_per_second_per_gpu": 9928.81, "total_tokens": 1079461969 }, { "epoch": 0.6834833708427107, "grad_norm": 0.9039415121078491, "learning_rate": 2e-05, "loss": 0.6584, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10933, "tokens_per_second_per_gpu": 10891.84, "total_tokens": 1079568084 }, { "epoch": 0.683545886471618, "grad_norm": 0.9243053197860718, "learning_rate": 2e-05, "loss": 0.6248, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10934, "tokens_per_second_per_gpu": 10477.07, "total_tokens": 1079665242 }, { "epoch": 0.6836084021005251, "grad_norm": 0.8936998844146729, "learning_rate": 2e-05, "loss": 0.6347, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10935, "tokens_per_second_per_gpu": 10428.4, "total_tokens": 1079763540 }, { "epoch": 0.6836709177294323, "grad_norm": 0.9066889882087708, "learning_rate": 2e-05, "loss": 0.6253, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10936, "tokens_per_second_per_gpu": 10228.19, "total_tokens": 1079863031 }, { "epoch": 0.6837334333583396, "grad_norm": 0.8986935615539551, "learning_rate": 2e-05, "loss": 0.64, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10937, "tokens_per_second_per_gpu": 10720.77, "total_tokens": 1079957999 }, { "epoch": 0.6837959489872468, "grad_norm": 0.9154784083366394, "learning_rate": 2e-05, "loss": 0.6597, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10938, "tokens_per_second_per_gpu": 10567.17, "total_tokens": 1080061789 }, { "epoch": 0.6838584646161541, "grad_norm": 0.8529427647590637, "learning_rate": 2e-05, "loss": 0.6691, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10939, "tokens_per_second_per_gpu": 10832.73, "total_tokens": 1080167182 }, { "epoch": 0.6839209802450613, "grad_norm": 0.9128632545471191, "learning_rate": 2e-05, "loss": 0.6121, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10940, "tokens_per_second_per_gpu": 10884.95, "total_tokens": 1080266834 }, { "epoch": 0.6839834958739685, "grad_norm": 0.8914470672607422, "learning_rate": 2e-05, "loss": 0.6144, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10941, "tokens_per_second_per_gpu": 10688.64, "total_tokens": 1080361684 }, { "epoch": 0.6840460115028757, "grad_norm": 0.8751394748687744, "learning_rate": 2e-05, "loss": 0.5954, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10942, "tokens_per_second_per_gpu": 11082.33, "total_tokens": 1080462226 }, { "epoch": 0.6841085271317829, "grad_norm": 0.8762344717979431, "learning_rate": 2e-05, "loss": 0.6226, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10943, "tokens_per_second_per_gpu": 10374.06, "total_tokens": 1080562535 }, { "epoch": 0.6841710427606902, "grad_norm": 0.8662176728248596, "learning_rate": 2e-05, "loss": 0.5978, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10944, "tokens_per_second_per_gpu": 10591.05, "total_tokens": 1080661142 }, { "epoch": 0.6842335583895974, "grad_norm": 0.8434469103813171, "learning_rate": 2e-05, "loss": 0.621, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10945, "tokens_per_second_per_gpu": 11272.84, "total_tokens": 1080764983 }, { "epoch": 0.6842960740185047, "grad_norm": 0.8827211260795593, "learning_rate": 2e-05, "loss": 0.5864, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10946, "tokens_per_second_per_gpu": 12571.85, "total_tokens": 1080859572 }, { "epoch": 0.6843585896474118, "grad_norm": 0.8888753056526184, "learning_rate": 2e-05, "loss": 0.6299, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10947, "tokens_per_second_per_gpu": 9730.52, "total_tokens": 1080956818 }, { "epoch": 0.684421105276319, "grad_norm": 0.8513634204864502, "learning_rate": 2e-05, "loss": 0.5719, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10948, "tokens_per_second_per_gpu": 9843.25, "total_tokens": 1081053439 }, { "epoch": 0.6844836209052263, "grad_norm": 0.873914361000061, "learning_rate": 2e-05, "loss": 0.6058, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10949, "tokens_per_second_per_gpu": 10160.17, "total_tokens": 1081155684 }, { "epoch": 0.6845461365341335, "grad_norm": 0.910144031047821, "learning_rate": 2e-05, "loss": 0.684, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10950, "tokens_per_second_per_gpu": 10462.08, "total_tokens": 1081253657 }, { "epoch": 0.6846086521630408, "grad_norm": 0.9031227231025696, "learning_rate": 2e-05, "loss": 0.629, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10951, "tokens_per_second_per_gpu": 11275.49, "total_tokens": 1081355367 }, { "epoch": 0.684671167791948, "grad_norm": 0.856275200843811, "learning_rate": 2e-05, "loss": 0.639, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10952, "tokens_per_second_per_gpu": 10842.48, "total_tokens": 1081457442 }, { "epoch": 0.6847336834208552, "grad_norm": 0.8915192484855652, "learning_rate": 2e-05, "loss": 0.6337, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10953, "tokens_per_second_per_gpu": 11125.13, "total_tokens": 1081557464 }, { "epoch": 0.6847961990497624, "grad_norm": 0.8887627124786377, "learning_rate": 2e-05, "loss": 0.6271, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10954, "tokens_per_second_per_gpu": 10975.8, "total_tokens": 1081656545 }, { "epoch": 0.6848587146786697, "grad_norm": 0.8609397411346436, "learning_rate": 2e-05, "loss": 0.6037, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10955, "tokens_per_second_per_gpu": 10561.65, "total_tokens": 1081759034 }, { "epoch": 0.6849212303075769, "grad_norm": 0.8988292217254639, "learning_rate": 2e-05, "loss": 0.6239, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10956, "tokens_per_second_per_gpu": 10605.06, "total_tokens": 1081859944 }, { "epoch": 0.6849837459364841, "grad_norm": 0.8839790225028992, "learning_rate": 2e-05, "loss": 0.6069, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10957, "tokens_per_second_per_gpu": 10248.24, "total_tokens": 1081956386 }, { "epoch": 0.6850462615653914, "grad_norm": 0.9367501735687256, "learning_rate": 2e-05, "loss": 0.608, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10958, "tokens_per_second_per_gpu": 9363.99, "total_tokens": 1082049593 }, { "epoch": 0.6851087771942985, "grad_norm": 0.9159179925918579, "learning_rate": 2e-05, "loss": 0.6171, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10959, "tokens_per_second_per_gpu": 9590.39, "total_tokens": 1082143454 }, { "epoch": 0.6851712928232058, "grad_norm": 0.8880190849304199, "learning_rate": 2e-05, "loss": 0.6235, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10960, "tokens_per_second_per_gpu": 10517.89, "total_tokens": 1082240103 }, { "epoch": 0.685233808452113, "grad_norm": 0.8543533682823181, "learning_rate": 2e-05, "loss": 0.6184, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10961, "tokens_per_second_per_gpu": 11430.83, "total_tokens": 1082344152 }, { "epoch": 0.6852963240810203, "grad_norm": 0.8967307806015015, "learning_rate": 2e-05, "loss": 0.6543, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10962, "tokens_per_second_per_gpu": 10579.92, "total_tokens": 1082441815 }, { "epoch": 0.6853588397099275, "grad_norm": 0.8661065697669983, "learning_rate": 2e-05, "loss": 0.6347, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10963, "tokens_per_second_per_gpu": 10576.92, "total_tokens": 1082541944 }, { "epoch": 0.6854213553388347, "grad_norm": 0.870917022228241, "learning_rate": 2e-05, "loss": 0.6112, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10964, "tokens_per_second_per_gpu": 10096.01, "total_tokens": 1082638033 }, { "epoch": 0.6854838709677419, "grad_norm": 0.8795871734619141, "learning_rate": 2e-05, "loss": 0.6199, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10965, "tokens_per_second_per_gpu": 10794.09, "total_tokens": 1082736622 }, { "epoch": 0.6855463865966491, "grad_norm": 0.879608154296875, "learning_rate": 2e-05, "loss": 0.6476, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10966, "tokens_per_second_per_gpu": 11539.71, "total_tokens": 1082837556 }, { "epoch": 0.6856089022255564, "grad_norm": 0.8810030221939087, "learning_rate": 2e-05, "loss": 0.6378, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10967, "tokens_per_second_per_gpu": 11044.91, "total_tokens": 1082936890 }, { "epoch": 0.6856714178544636, "grad_norm": 0.867336630821228, "learning_rate": 2e-05, "loss": 0.623, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10968, "tokens_per_second_per_gpu": 10355.9, "total_tokens": 1083035212 }, { "epoch": 0.6857339334833709, "grad_norm": 0.9092862010002136, "learning_rate": 2e-05, "loss": 0.6182, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10969, "tokens_per_second_per_gpu": 10507.23, "total_tokens": 1083129580 }, { "epoch": 0.6857964491122781, "grad_norm": 0.8524524569511414, "learning_rate": 2e-05, "loss": 0.5982, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10970, "tokens_per_second_per_gpu": 10590.25, "total_tokens": 1083227068 }, { "epoch": 0.6858589647411854, "grad_norm": 0.8929662704467773, "learning_rate": 2e-05, "loss": 0.6362, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10971, "tokens_per_second_per_gpu": 10792.26, "total_tokens": 1083328010 }, { "epoch": 0.6859214803700925, "grad_norm": 0.8516675233840942, "learning_rate": 2e-05, "loss": 0.6087, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10972, "tokens_per_second_per_gpu": 10728.87, "total_tokens": 1083426485 }, { "epoch": 0.6859839959989997, "grad_norm": 0.970795750617981, "learning_rate": 2e-05, "loss": 0.6627, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10973, "tokens_per_second_per_gpu": 10465.63, "total_tokens": 1083529051 }, { "epoch": 0.686046511627907, "grad_norm": 0.8809853196144104, "learning_rate": 2e-05, "loss": 0.5872, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10974, "tokens_per_second_per_gpu": 11007.78, "total_tokens": 1083626119 }, { "epoch": 0.6861090272568142, "grad_norm": 0.8832497596740723, "learning_rate": 2e-05, "loss": 0.6618, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10975, "tokens_per_second_per_gpu": 10595.47, "total_tokens": 1083725745 }, { "epoch": 0.6861715428857215, "grad_norm": 0.8625187277793884, "learning_rate": 2e-05, "loss": 0.6277, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10976, "tokens_per_second_per_gpu": 11331.97, "total_tokens": 1083825602 }, { "epoch": 0.6862340585146287, "grad_norm": 0.874007523059845, "learning_rate": 2e-05, "loss": 0.5845, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10977, "tokens_per_second_per_gpu": 9887.82, "total_tokens": 1083921964 }, { "epoch": 0.6862965741435358, "grad_norm": 0.9391837120056152, "learning_rate": 2e-05, "loss": 0.6665, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10978, "tokens_per_second_per_gpu": 10444.64, "total_tokens": 1084021367 }, { "epoch": 0.6863590897724431, "grad_norm": 0.8880404233932495, "learning_rate": 2e-05, "loss": 0.5981, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10979, "tokens_per_second_per_gpu": 10358.94, "total_tokens": 1084120623 }, { "epoch": 0.6864216054013503, "grad_norm": 0.900119423866272, "learning_rate": 2e-05, "loss": 0.6541, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10980, "tokens_per_second_per_gpu": 10064.78, "total_tokens": 1084217296 }, { "epoch": 0.6864841210302576, "grad_norm": 0.8932744264602661, "learning_rate": 2e-05, "loss": 0.5914, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10981, "tokens_per_second_per_gpu": 10878.7, "total_tokens": 1084314043 }, { "epoch": 0.6865466366591648, "grad_norm": 0.8689725399017334, "learning_rate": 2e-05, "loss": 0.6112, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10982, "tokens_per_second_per_gpu": 10264.23, "total_tokens": 1084410805 }, { "epoch": 0.6866091522880721, "grad_norm": 0.8609076738357544, "learning_rate": 2e-05, "loss": 0.6355, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10983, "tokens_per_second_per_gpu": 11410.36, "total_tokens": 1084513452 }, { "epoch": 0.6866716679169792, "grad_norm": 0.8846040368080139, "learning_rate": 2e-05, "loss": 0.6257, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10984, "tokens_per_second_per_gpu": 10630.75, "total_tokens": 1084613975 }, { "epoch": 0.6867341835458864, "grad_norm": 0.8926864266395569, "learning_rate": 2e-05, "loss": 0.6333, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10985, "tokens_per_second_per_gpu": 10599.56, "total_tokens": 1084715122 }, { "epoch": 0.6867966991747937, "grad_norm": 0.8982280492782593, "learning_rate": 2e-05, "loss": 0.6404, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10986, "tokens_per_second_per_gpu": 10234.38, "total_tokens": 1084814217 }, { "epoch": 0.6868592148037009, "grad_norm": 0.856646716594696, "learning_rate": 2e-05, "loss": 0.624, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10987, "tokens_per_second_per_gpu": 11005.98, "total_tokens": 1084918152 }, { "epoch": 0.6869217304326082, "grad_norm": 0.877672553062439, "learning_rate": 2e-05, "loss": 0.634, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10988, "tokens_per_second_per_gpu": 11140.78, "total_tokens": 1085022022 }, { "epoch": 0.6869842460615154, "grad_norm": 0.8624589443206787, "learning_rate": 2e-05, "loss": 0.6211, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10989, "tokens_per_second_per_gpu": 10826.93, "total_tokens": 1085123039 }, { "epoch": 0.6870467616904226, "grad_norm": 0.9155689477920532, "learning_rate": 2e-05, "loss": 0.6487, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10990, "tokens_per_second_per_gpu": 11069.59, "total_tokens": 1085221902 }, { "epoch": 0.6871092773193298, "grad_norm": 0.8745464086532593, "learning_rate": 2e-05, "loss": 0.6512, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10991, "tokens_per_second_per_gpu": 10763.87, "total_tokens": 1085322412 }, { "epoch": 0.687171792948237, "grad_norm": 0.8515556454658508, "learning_rate": 2e-05, "loss": 0.5777, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10992, "tokens_per_second_per_gpu": 10992.74, "total_tokens": 1085422776 }, { "epoch": 0.6872343085771443, "grad_norm": 0.9505006670951843, "learning_rate": 2e-05, "loss": 0.6149, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10993, "tokens_per_second_per_gpu": 10317.04, "total_tokens": 1085521103 }, { "epoch": 0.6872968242060515, "grad_norm": 0.8649444580078125, "learning_rate": 2e-05, "loss": 0.6562, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10994, "tokens_per_second_per_gpu": 10509.1, "total_tokens": 1085619791 }, { "epoch": 0.6873593398349588, "grad_norm": 0.9534171223640442, "learning_rate": 2e-05, "loss": 0.6806, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10995, "tokens_per_second_per_gpu": 10011.72, "total_tokens": 1085716918 }, { "epoch": 0.6874218554638659, "grad_norm": 0.8529025912284851, "learning_rate": 2e-05, "loss": 0.5981, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10996, "tokens_per_second_per_gpu": 10634.68, "total_tokens": 1085818083 }, { "epoch": 0.6874843710927732, "grad_norm": 0.847510039806366, "learning_rate": 2e-05, "loss": 0.5943, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10997, "tokens_per_second_per_gpu": 10324.52, "total_tokens": 1085916887 }, { "epoch": 0.6875468867216804, "grad_norm": 0.9111329317092896, "learning_rate": 2e-05, "loss": 0.6401, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10998, "tokens_per_second_per_gpu": 10528.49, "total_tokens": 1086016908 }, { "epoch": 0.6876094023505877, "grad_norm": 0.8930442333221436, "learning_rate": 2e-05, "loss": 0.6269, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 10999, "tokens_per_second_per_gpu": 11231.0, "total_tokens": 1086120173 }, { "epoch": 0.6876719179794949, "grad_norm": 0.918786883354187, "learning_rate": 2e-05, "loss": 0.6111, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11000, "tokens_per_second_per_gpu": 10075.19, "total_tokens": 1086213955 }, { "epoch": 0.6877344336084021, "grad_norm": 0.9157921671867371, "learning_rate": 2e-05, "loss": 0.6671, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11001, "tokens_per_second_per_gpu": 10703.62, "total_tokens": 1086316133 }, { "epoch": 0.6877969492373093, "grad_norm": 0.8756975531578064, "learning_rate": 2e-05, "loss": 0.6512, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11002, "tokens_per_second_per_gpu": 11131.88, "total_tokens": 1086418494 }, { "epoch": 0.6878594648662165, "grad_norm": 0.9749639630317688, "learning_rate": 2e-05, "loss": 0.61, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11003, "tokens_per_second_per_gpu": 10142.03, "total_tokens": 1086517144 }, { "epoch": 0.6879219804951238, "grad_norm": 0.890057384967804, "learning_rate": 2e-05, "loss": 0.6429, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11004, "tokens_per_second_per_gpu": 10690.97, "total_tokens": 1086614931 }, { "epoch": 0.687984496124031, "grad_norm": 0.8955698609352112, "learning_rate": 2e-05, "loss": 0.6028, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11005, "tokens_per_second_per_gpu": 10548.56, "total_tokens": 1086714964 }, { "epoch": 0.6880470117529383, "grad_norm": 0.9065214395523071, "learning_rate": 2e-05, "loss": 0.6463, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11006, "tokens_per_second_per_gpu": 11106.48, "total_tokens": 1086815959 }, { "epoch": 0.6881095273818455, "grad_norm": 0.8780428767204285, "learning_rate": 2e-05, "loss": 0.6619, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11007, "tokens_per_second_per_gpu": 10572.24, "total_tokens": 1086916451 }, { "epoch": 0.6881720430107527, "grad_norm": 0.8878204226493835, "learning_rate": 2e-05, "loss": 0.6025, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11008, "tokens_per_second_per_gpu": 10037.62, "total_tokens": 1087012231 }, { "epoch": 0.6882345586396599, "grad_norm": 0.8815022706985474, "learning_rate": 2e-05, "loss": 0.6314, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11009, "tokens_per_second_per_gpu": 10649.76, "total_tokens": 1087114524 }, { "epoch": 0.6882970742685671, "grad_norm": 0.8816601037979126, "learning_rate": 2e-05, "loss": 0.6031, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11010, "tokens_per_second_per_gpu": 10348.95, "total_tokens": 1087212855 }, { "epoch": 0.6883595898974744, "grad_norm": 0.8461988568305969, "learning_rate": 2e-05, "loss": 0.6055, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11011, "tokens_per_second_per_gpu": 10306.17, "total_tokens": 1087311784 }, { "epoch": 0.6884221055263816, "grad_norm": 0.9123976826667786, "learning_rate": 2e-05, "loss": 0.6542, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11012, "tokens_per_second_per_gpu": 10827.05, "total_tokens": 1087414015 }, { "epoch": 0.6884846211552889, "grad_norm": 0.9223527312278748, "learning_rate": 2e-05, "loss": 0.6856, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11013, "tokens_per_second_per_gpu": 11669.78, "total_tokens": 1087520742 }, { "epoch": 0.6885471367841961, "grad_norm": 0.9163883328437805, "learning_rate": 2e-05, "loss": 0.5846, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11014, "tokens_per_second_per_gpu": 10431.63, "total_tokens": 1087614412 }, { "epoch": 0.6886096524131032, "grad_norm": 0.8724128007888794, "learning_rate": 2e-05, "loss": 0.6567, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11015, "tokens_per_second_per_gpu": 10864.45, "total_tokens": 1087718398 }, { "epoch": 0.6886721680420105, "grad_norm": 0.8639160990715027, "learning_rate": 2e-05, "loss": 0.5893, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11016, "tokens_per_second_per_gpu": 11293.81, "total_tokens": 1087819293 }, { "epoch": 0.6887346836709177, "grad_norm": 0.9159881472587585, "learning_rate": 2e-05, "loss": 0.6285, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11017, "tokens_per_second_per_gpu": 9857.73, "total_tokens": 1087915427 }, { "epoch": 0.688797199299825, "grad_norm": 0.8761580586433411, "learning_rate": 2e-05, "loss": 0.6365, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11018, "tokens_per_second_per_gpu": 9938.64, "total_tokens": 1088015921 }, { "epoch": 0.6888597149287322, "grad_norm": 0.9210368990898132, "learning_rate": 2e-05, "loss": 0.5888, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11019, "tokens_per_second_per_gpu": 10189.55, "total_tokens": 1088111521 }, { "epoch": 0.6889222305576395, "grad_norm": 0.8886882066726685, "learning_rate": 2e-05, "loss": 0.6691, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11020, "tokens_per_second_per_gpu": 10263.45, "total_tokens": 1088212831 }, { "epoch": 0.6889847461865466, "grad_norm": 0.879943311214447, "learning_rate": 2e-05, "loss": 0.5765, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11021, "tokens_per_second_per_gpu": 10668.13, "total_tokens": 1088311167 }, { "epoch": 0.6890472618154538, "grad_norm": 0.8902972936630249, "learning_rate": 2e-05, "loss": 0.612, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11022, "tokens_per_second_per_gpu": 10520.97, "total_tokens": 1088407427 }, { "epoch": 0.6891097774443611, "grad_norm": 0.899114191532135, "learning_rate": 2e-05, "loss": 0.6222, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11023, "tokens_per_second_per_gpu": 10454.44, "total_tokens": 1088506174 }, { "epoch": 0.6891722930732683, "grad_norm": 0.8658840656280518, "learning_rate": 2e-05, "loss": 0.6447, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11024, "tokens_per_second_per_gpu": 10604.36, "total_tokens": 1088609174 }, { "epoch": 0.6892348087021756, "grad_norm": 0.8884690999984741, "learning_rate": 2e-05, "loss": 0.5845, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11025, "tokens_per_second_per_gpu": 9468.98, "total_tokens": 1088703837 }, { "epoch": 0.6892973243310828, "grad_norm": 0.887985348701477, "learning_rate": 2e-05, "loss": 0.6117, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11026, "tokens_per_second_per_gpu": 9905.63, "total_tokens": 1088798649 }, { "epoch": 0.68935983995999, "grad_norm": 0.8771805763244629, "learning_rate": 2e-05, "loss": 0.6006, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11027, "tokens_per_second_per_gpu": 10584.86, "total_tokens": 1088897029 }, { "epoch": 0.6894223555888972, "grad_norm": 0.850411057472229, "learning_rate": 2e-05, "loss": 0.6057, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11028, "tokens_per_second_per_gpu": 10488.99, "total_tokens": 1088993936 }, { "epoch": 0.6894848712178044, "grad_norm": 0.8450949788093567, "learning_rate": 2e-05, "loss": 0.5946, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11029, "tokens_per_second_per_gpu": 9964.94, "total_tokens": 1089091584 }, { "epoch": 0.6895473868467117, "grad_norm": 0.8848236203193665, "learning_rate": 2e-05, "loss": 0.6508, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11030, "tokens_per_second_per_gpu": 10789.97, "total_tokens": 1089189993 }, { "epoch": 0.6896099024756189, "grad_norm": 0.8798092603683472, "learning_rate": 2e-05, "loss": 0.6193, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11031, "tokens_per_second_per_gpu": 10696.68, "total_tokens": 1089288818 }, { "epoch": 0.6896724181045262, "grad_norm": 0.9252857565879822, "learning_rate": 2e-05, "loss": 0.6471, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11032, "tokens_per_second_per_gpu": 11248.42, "total_tokens": 1089390845 }, { "epoch": 0.6897349337334333, "grad_norm": 0.9265497326850891, "learning_rate": 2e-05, "loss": 0.625, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11033, "tokens_per_second_per_gpu": 10795.4, "total_tokens": 1089490502 }, { "epoch": 0.6897974493623406, "grad_norm": 0.9304255843162537, "learning_rate": 2e-05, "loss": 0.6335, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11034, "tokens_per_second_per_gpu": 10703.16, "total_tokens": 1089590364 }, { "epoch": 0.6898599649912478, "grad_norm": 0.8824819922447205, "learning_rate": 2e-05, "loss": 0.6197, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11035, "tokens_per_second_per_gpu": 9997.44, "total_tokens": 1089687980 }, { "epoch": 0.689922480620155, "grad_norm": 0.8924325704574585, "learning_rate": 2e-05, "loss": 0.6336, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11036, "tokens_per_second_per_gpu": 10365.99, "total_tokens": 1089787153 }, { "epoch": 0.6899849962490623, "grad_norm": 0.866936981678009, "learning_rate": 2e-05, "loss": 0.6377, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11037, "tokens_per_second_per_gpu": 11280.67, "total_tokens": 1089893797 }, { "epoch": 0.6900475118779695, "grad_norm": 0.9127984642982483, "learning_rate": 2e-05, "loss": 0.6276, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11038, "tokens_per_second_per_gpu": 10454.23, "total_tokens": 1089995511 }, { "epoch": 0.6901100275068767, "grad_norm": 0.8625684380531311, "learning_rate": 2e-05, "loss": 0.651, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11039, "tokens_per_second_per_gpu": 11553.8, "total_tokens": 1090098055 }, { "epoch": 0.6901725431357839, "grad_norm": 0.8897787928581238, "learning_rate": 2e-05, "loss": 0.6331, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11040, "tokens_per_second_per_gpu": 9885.12, "total_tokens": 1090195063 }, { "epoch": 0.6902350587646912, "grad_norm": 0.9204707741737366, "learning_rate": 2e-05, "loss": 0.6542, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11041, "tokens_per_second_per_gpu": 10294.08, "total_tokens": 1090291618 }, { "epoch": 0.6902975743935984, "grad_norm": 0.8708849549293518, "learning_rate": 2e-05, "loss": 0.6026, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11042, "tokens_per_second_per_gpu": 10530.29, "total_tokens": 1090391254 }, { "epoch": 0.6903600900225056, "grad_norm": 0.8851704001426697, "learning_rate": 2e-05, "loss": 0.6449, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11043, "tokens_per_second_per_gpu": 10524.33, "total_tokens": 1090494431 }, { "epoch": 0.6904226056514129, "grad_norm": 0.9543933868408203, "learning_rate": 2e-05, "loss": 0.656, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11044, "tokens_per_second_per_gpu": 10251.1, "total_tokens": 1090589385 }, { "epoch": 0.6904851212803201, "grad_norm": 0.876549482345581, "learning_rate": 2e-05, "loss": 0.6397, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11045, "tokens_per_second_per_gpu": 10741.21, "total_tokens": 1090688452 }, { "epoch": 0.6905476369092273, "grad_norm": 0.8483208417892456, "learning_rate": 2e-05, "loss": 0.5865, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11046, "tokens_per_second_per_gpu": 11418.31, "total_tokens": 1090791591 }, { "epoch": 0.6906101525381345, "grad_norm": 0.9011995196342468, "learning_rate": 2e-05, "loss": 0.6122, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11047, "tokens_per_second_per_gpu": 10322.89, "total_tokens": 1090887627 }, { "epoch": 0.6906726681670418, "grad_norm": 0.8874435424804688, "learning_rate": 2e-05, "loss": 0.5798, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11048, "tokens_per_second_per_gpu": 10274.75, "total_tokens": 1090983140 }, { "epoch": 0.690735183795949, "grad_norm": 0.8908029198646545, "learning_rate": 2e-05, "loss": 0.6199, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11049, "tokens_per_second_per_gpu": 10659.68, "total_tokens": 1091084515 }, { "epoch": 0.6907976994248562, "grad_norm": 0.8731134533882141, "learning_rate": 2e-05, "loss": 0.5853, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11050, "tokens_per_second_per_gpu": 9745.84, "total_tokens": 1091180064 }, { "epoch": 0.6908602150537635, "grad_norm": 0.8837382197380066, "learning_rate": 2e-05, "loss": 0.6249, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11051, "tokens_per_second_per_gpu": 10597.5, "total_tokens": 1091279784 }, { "epoch": 0.6909227306826706, "grad_norm": 0.9175143241882324, "learning_rate": 2e-05, "loss": 0.5905, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11052, "tokens_per_second_per_gpu": 9799.95, "total_tokens": 1091373758 }, { "epoch": 0.6909852463115779, "grad_norm": 0.8472908735275269, "learning_rate": 2e-05, "loss": 0.5952, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11053, "tokens_per_second_per_gpu": 10902.7, "total_tokens": 1091472622 }, { "epoch": 0.6910477619404851, "grad_norm": 0.8924551606178284, "learning_rate": 2e-05, "loss": 0.6116, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11054, "tokens_per_second_per_gpu": 10365.66, "total_tokens": 1091569598 }, { "epoch": 0.6911102775693924, "grad_norm": 0.9004071354866028, "learning_rate": 2e-05, "loss": 0.6193, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11055, "tokens_per_second_per_gpu": 10940.71, "total_tokens": 1091669555 }, { "epoch": 0.6911727931982996, "grad_norm": 0.8927003145217896, "learning_rate": 2e-05, "loss": 0.6367, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11056, "tokens_per_second_per_gpu": 10923.72, "total_tokens": 1091768505 }, { "epoch": 0.6912353088272069, "grad_norm": 0.8830884695053101, "learning_rate": 2e-05, "loss": 0.664, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11057, "tokens_per_second_per_gpu": 10473.93, "total_tokens": 1091869080 }, { "epoch": 0.691297824456114, "grad_norm": 0.8681142330169678, "learning_rate": 2e-05, "loss": 0.6099, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11058, "tokens_per_second_per_gpu": 10776.82, "total_tokens": 1091971313 }, { "epoch": 0.6913603400850212, "grad_norm": 0.8929962515830994, "learning_rate": 2e-05, "loss": 0.6074, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11059, "tokens_per_second_per_gpu": 10072.6, "total_tokens": 1092067957 }, { "epoch": 0.6914228557139285, "grad_norm": 0.884852409362793, "learning_rate": 2e-05, "loss": 0.5987, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11060, "tokens_per_second_per_gpu": 10160.19, "total_tokens": 1092164766 }, { "epoch": 0.6914853713428357, "grad_norm": 0.8767085671424866, "learning_rate": 2e-05, "loss": 0.5917, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11061, "tokens_per_second_per_gpu": 10426.56, "total_tokens": 1092261459 }, { "epoch": 0.691547886971743, "grad_norm": 0.8924128413200378, "learning_rate": 2e-05, "loss": 0.6233, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11062, "tokens_per_second_per_gpu": 10282.34, "total_tokens": 1092358166 }, { "epoch": 0.6916104026006502, "grad_norm": 0.8883213400840759, "learning_rate": 2e-05, "loss": 0.58, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11063, "tokens_per_second_per_gpu": 10185.37, "total_tokens": 1092455488 }, { "epoch": 0.6916729182295573, "grad_norm": 0.8929161429405212, "learning_rate": 2e-05, "loss": 0.6439, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11064, "tokens_per_second_per_gpu": 9863.35, "total_tokens": 1092554106 }, { "epoch": 0.6917354338584646, "grad_norm": 0.8652510643005371, "learning_rate": 2e-05, "loss": 0.6251, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11065, "tokens_per_second_per_gpu": 10574.01, "total_tokens": 1092653771 }, { "epoch": 0.6917979494873718, "grad_norm": 0.8846624493598938, "learning_rate": 2e-05, "loss": 0.5621, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11066, "tokens_per_second_per_gpu": 10340.29, "total_tokens": 1092748006 }, { "epoch": 0.6918604651162791, "grad_norm": 0.9077754616737366, "learning_rate": 2e-05, "loss": 0.6369, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11067, "tokens_per_second_per_gpu": 10500.88, "total_tokens": 1092848199 }, { "epoch": 0.6919229807451863, "grad_norm": 0.8921433687210083, "learning_rate": 2e-05, "loss": 0.6214, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11068, "tokens_per_second_per_gpu": 10652.42, "total_tokens": 1092950003 }, { "epoch": 0.6919854963740936, "grad_norm": 0.8730592727661133, "learning_rate": 2e-05, "loss": 0.6045, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11069, "tokens_per_second_per_gpu": 10851.59, "total_tokens": 1093048758 }, { "epoch": 0.6920480120030007, "grad_norm": 0.8748107552528381, "learning_rate": 2e-05, "loss": 0.6299, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11070, "tokens_per_second_per_gpu": 11007.43, "total_tokens": 1093147601 }, { "epoch": 0.692110527631908, "grad_norm": 0.8707382678985596, "learning_rate": 2e-05, "loss": 0.626, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11071, "tokens_per_second_per_gpu": 11268.63, "total_tokens": 1093248567 }, { "epoch": 0.6921730432608152, "grad_norm": 0.9239550232887268, "learning_rate": 2e-05, "loss": 0.6237, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11072, "tokens_per_second_per_gpu": 10399.93, "total_tokens": 1093348409 }, { "epoch": 0.6922355588897224, "grad_norm": 0.8866915702819824, "learning_rate": 2e-05, "loss": 0.6123, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11073, "tokens_per_second_per_gpu": 10186.67, "total_tokens": 1093446546 }, { "epoch": 0.6922980745186297, "grad_norm": 0.9111559987068176, "learning_rate": 2e-05, "loss": 0.6157, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11074, "tokens_per_second_per_gpu": 10229.89, "total_tokens": 1093545416 }, { "epoch": 0.6923605901475369, "grad_norm": 0.9035647511482239, "learning_rate": 2e-05, "loss": 0.6426, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11075, "tokens_per_second_per_gpu": 10976.77, "total_tokens": 1093646217 }, { "epoch": 0.6924231057764441, "grad_norm": 0.9129696488380432, "learning_rate": 2e-05, "loss": 0.6165, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11076, "tokens_per_second_per_gpu": 10172.5, "total_tokens": 1093743486 }, { "epoch": 0.6924856214053513, "grad_norm": 0.9102100729942322, "learning_rate": 2e-05, "loss": 0.6213, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11077, "tokens_per_second_per_gpu": 10060.54, "total_tokens": 1093835891 }, { "epoch": 0.6925481370342585, "grad_norm": 0.9256289005279541, "learning_rate": 2e-05, "loss": 0.6837, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11078, "tokens_per_second_per_gpu": 10900.24, "total_tokens": 1093935769 }, { "epoch": 0.6926106526631658, "grad_norm": 0.9159390330314636, "learning_rate": 2e-05, "loss": 0.6315, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11079, "tokens_per_second_per_gpu": 10489.85, "total_tokens": 1094036056 }, { "epoch": 0.692673168292073, "grad_norm": 0.8963721990585327, "learning_rate": 2e-05, "loss": 0.5961, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11080, "tokens_per_second_per_gpu": 10034.73, "total_tokens": 1094133415 }, { "epoch": 0.6927356839209803, "grad_norm": 0.9862346053123474, "learning_rate": 2e-05, "loss": 0.6024, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11081, "tokens_per_second_per_gpu": 9225.19, "total_tokens": 1094224875 }, { "epoch": 0.6927981995498874, "grad_norm": 0.8741656541824341, "learning_rate": 2e-05, "loss": 0.6184, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11082, "tokens_per_second_per_gpu": 10288.21, "total_tokens": 1094326727 }, { "epoch": 0.6928607151787947, "grad_norm": 0.9062965512275696, "learning_rate": 2e-05, "loss": 0.6454, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11083, "tokens_per_second_per_gpu": 10725.47, "total_tokens": 1094426626 }, { "epoch": 0.6929232308077019, "grad_norm": 0.9285746812820435, "learning_rate": 2e-05, "loss": 0.6433, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11084, "tokens_per_second_per_gpu": 10515.32, "total_tokens": 1094524853 }, { "epoch": 0.6929857464366092, "grad_norm": 0.9090070128440857, "learning_rate": 2e-05, "loss": 0.6613, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11085, "tokens_per_second_per_gpu": 11115.92, "total_tokens": 1094627928 }, { "epoch": 0.6930482620655164, "grad_norm": 0.917486310005188, "learning_rate": 2e-05, "loss": 0.5921, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11086, "tokens_per_second_per_gpu": 9516.73, "total_tokens": 1094719891 }, { "epoch": 0.6931107776944236, "grad_norm": 0.8885623812675476, "learning_rate": 2e-05, "loss": 0.6289, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11087, "tokens_per_second_per_gpu": 10326.75, "total_tokens": 1094821355 }, { "epoch": 0.6931732933233309, "grad_norm": 0.9578036665916443, "learning_rate": 2e-05, "loss": 0.6301, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11088, "tokens_per_second_per_gpu": 10219.29, "total_tokens": 1094919620 }, { "epoch": 0.693235808952238, "grad_norm": 0.908166229724884, "learning_rate": 2e-05, "loss": 0.6037, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11089, "tokens_per_second_per_gpu": 9576.54, "total_tokens": 1095013986 }, { "epoch": 0.6932983245811453, "grad_norm": 0.921342670917511, "learning_rate": 2e-05, "loss": 0.6318, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11090, "tokens_per_second_per_gpu": 11230.08, "total_tokens": 1095114936 }, { "epoch": 0.6933608402100525, "grad_norm": 0.8968091607093811, "learning_rate": 2e-05, "loss": 0.6605, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11091, "tokens_per_second_per_gpu": 10272.9, "total_tokens": 1095211337 }, { "epoch": 0.6934233558389598, "grad_norm": 0.9102872014045715, "learning_rate": 2e-05, "loss": 0.6389, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11092, "tokens_per_second_per_gpu": 9811.77, "total_tokens": 1095309013 }, { "epoch": 0.693485871467867, "grad_norm": 0.8712159991264343, "learning_rate": 2e-05, "loss": 0.6387, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11093, "tokens_per_second_per_gpu": 10852.71, "total_tokens": 1095408903 }, { "epoch": 0.6935483870967742, "grad_norm": 0.9294523000717163, "learning_rate": 2e-05, "loss": 0.6237, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11094, "tokens_per_second_per_gpu": 10236.58, "total_tokens": 1095503837 }, { "epoch": 0.6936109027256814, "grad_norm": 0.9034932255744934, "learning_rate": 2e-05, "loss": 0.6471, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11095, "tokens_per_second_per_gpu": 11274.09, "total_tokens": 1095603935 }, { "epoch": 0.6936734183545886, "grad_norm": 0.888340413570404, "learning_rate": 2e-05, "loss": 0.6097, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11096, "tokens_per_second_per_gpu": 10059.34, "total_tokens": 1095703183 }, { "epoch": 0.6937359339834959, "grad_norm": 0.8831367492675781, "learning_rate": 2e-05, "loss": 0.6239, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11097, "tokens_per_second_per_gpu": 10500.5, "total_tokens": 1095805512 }, { "epoch": 0.6937984496124031, "grad_norm": 0.8906134366989136, "learning_rate": 2e-05, "loss": 0.5838, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11098, "tokens_per_second_per_gpu": 10343.79, "total_tokens": 1095904134 }, { "epoch": 0.6938609652413104, "grad_norm": 0.9084038734436035, "learning_rate": 2e-05, "loss": 0.6102, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11099, "tokens_per_second_per_gpu": 10845.5, "total_tokens": 1096003766 }, { "epoch": 0.6939234808702176, "grad_norm": 0.912778377532959, "learning_rate": 2e-05, "loss": 0.6638, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11100, "tokens_per_second_per_gpu": 10431.57, "total_tokens": 1096100745 }, { "epoch": 0.6939859964991247, "grad_norm": 0.8728528022766113, "learning_rate": 2e-05, "loss": 0.6235, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11101, "tokens_per_second_per_gpu": 10050.97, "total_tokens": 1096197306 }, { "epoch": 0.694048512128032, "grad_norm": 0.9324371218681335, "learning_rate": 2e-05, "loss": 0.6434, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11102, "tokens_per_second_per_gpu": 10375.2, "total_tokens": 1096290987 }, { "epoch": 0.6941110277569392, "grad_norm": 0.8669212460517883, "learning_rate": 2e-05, "loss": 0.6197, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11103, "tokens_per_second_per_gpu": 10805.36, "total_tokens": 1096391000 }, { "epoch": 0.6941735433858465, "grad_norm": 0.8878796696662903, "learning_rate": 2e-05, "loss": 0.6272, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11104, "tokens_per_second_per_gpu": 10371.78, "total_tokens": 1096490352 }, { "epoch": 0.6942360590147537, "grad_norm": 0.8744856715202332, "learning_rate": 2e-05, "loss": 0.6434, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11105, "tokens_per_second_per_gpu": 10623.4, "total_tokens": 1096591303 }, { "epoch": 0.694298574643661, "grad_norm": 0.8553615808486938, "learning_rate": 2e-05, "loss": 0.579, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11106, "tokens_per_second_per_gpu": 9693.35, "total_tokens": 1096686176 }, { "epoch": 0.6943610902725681, "grad_norm": 0.8822534084320068, "learning_rate": 2e-05, "loss": 0.652, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11107, "tokens_per_second_per_gpu": 10734.57, "total_tokens": 1096783772 }, { "epoch": 0.6944236059014753, "grad_norm": 0.8779612183570862, "learning_rate": 2e-05, "loss": 0.624, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11108, "tokens_per_second_per_gpu": 10537.84, "total_tokens": 1096881791 }, { "epoch": 0.6944861215303826, "grad_norm": 0.898181676864624, "learning_rate": 2e-05, "loss": 0.6022, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11109, "tokens_per_second_per_gpu": 10190.76, "total_tokens": 1096982012 }, { "epoch": 0.6945486371592898, "grad_norm": 0.890213131904602, "learning_rate": 2e-05, "loss": 0.6155, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11110, "tokens_per_second_per_gpu": 10415.92, "total_tokens": 1097079539 }, { "epoch": 0.6946111527881971, "grad_norm": 0.8593057990074158, "learning_rate": 2e-05, "loss": 0.6264, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11111, "tokens_per_second_per_gpu": 10376.04, "total_tokens": 1097179078 }, { "epoch": 0.6946736684171043, "grad_norm": 0.9168479442596436, "learning_rate": 2e-05, "loss": 0.6389, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11112, "tokens_per_second_per_gpu": 9821.23, "total_tokens": 1097276808 }, { "epoch": 0.6947361840460115, "grad_norm": 0.8847876787185669, "learning_rate": 2e-05, "loss": 0.6358, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11113, "tokens_per_second_per_gpu": 10652.63, "total_tokens": 1097376635 }, { "epoch": 0.6947986996749187, "grad_norm": 0.8771119117736816, "learning_rate": 2e-05, "loss": 0.6235, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11114, "tokens_per_second_per_gpu": 10919.43, "total_tokens": 1097474598 }, { "epoch": 0.6948612153038259, "grad_norm": 0.915301501750946, "learning_rate": 2e-05, "loss": 0.5913, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11115, "tokens_per_second_per_gpu": 9741.73, "total_tokens": 1097566762 }, { "epoch": 0.6949237309327332, "grad_norm": 0.8843123912811279, "learning_rate": 2e-05, "loss": 0.5954, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11116, "tokens_per_second_per_gpu": 10633.39, "total_tokens": 1097663660 }, { "epoch": 0.6949862465616404, "grad_norm": 0.8986313939094543, "learning_rate": 2e-05, "loss": 0.6584, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11117, "tokens_per_second_per_gpu": 10721.71, "total_tokens": 1097762193 }, { "epoch": 0.6950487621905477, "grad_norm": 0.9026349186897278, "learning_rate": 2e-05, "loss": 0.6561, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11118, "tokens_per_second_per_gpu": 10337.97, "total_tokens": 1097859064 }, { "epoch": 0.6951112778194548, "grad_norm": 0.8214009404182434, "learning_rate": 2e-05, "loss": 0.5708, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11119, "tokens_per_second_per_gpu": 10897.96, "total_tokens": 1097963037 }, { "epoch": 0.695173793448362, "grad_norm": 0.8745517134666443, "learning_rate": 2e-05, "loss": 0.6054, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11120, "tokens_per_second_per_gpu": 10050.53, "total_tokens": 1098061071 }, { "epoch": 0.6952363090772693, "grad_norm": 0.8447365164756775, "learning_rate": 2e-05, "loss": 0.5901, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11121, "tokens_per_second_per_gpu": 10213.79, "total_tokens": 1098159588 }, { "epoch": 0.6952988247061765, "grad_norm": 0.837993323802948, "learning_rate": 2e-05, "loss": 0.5966, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11122, "tokens_per_second_per_gpu": 10347.43, "total_tokens": 1098261117 }, { "epoch": 0.6953613403350838, "grad_norm": 0.895391583442688, "learning_rate": 2e-05, "loss": 0.6258, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11123, "tokens_per_second_per_gpu": 10837.64, "total_tokens": 1098360268 }, { "epoch": 0.695423855963991, "grad_norm": 0.8721409440040588, "learning_rate": 2e-05, "loss": 0.5935, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11124, "tokens_per_second_per_gpu": 10619.0, "total_tokens": 1098459270 }, { "epoch": 0.6954863715928983, "grad_norm": 0.8683908581733704, "learning_rate": 2e-05, "loss": 0.6154, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11125, "tokens_per_second_per_gpu": 10582.8, "total_tokens": 1098556244 }, { "epoch": 0.6955488872218054, "grad_norm": 0.8734541535377502, "learning_rate": 2e-05, "loss": 0.6249, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11126, "tokens_per_second_per_gpu": 10410.49, "total_tokens": 1098656520 }, { "epoch": 0.6956114028507127, "grad_norm": 0.8617779016494751, "learning_rate": 2e-05, "loss": 0.5805, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11127, "tokens_per_second_per_gpu": 9459.81, "total_tokens": 1098751157 }, { "epoch": 0.6956739184796199, "grad_norm": 0.8756853342056274, "learning_rate": 2e-05, "loss": 0.6346, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11128, "tokens_per_second_per_gpu": 10405.69, "total_tokens": 1098850549 }, { "epoch": 0.6957364341085271, "grad_norm": 0.8991755247116089, "learning_rate": 2e-05, "loss": 0.5936, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11129, "tokens_per_second_per_gpu": 10555.25, "total_tokens": 1098946409 }, { "epoch": 0.6957989497374344, "grad_norm": 0.8855982422828674, "learning_rate": 2e-05, "loss": 0.6236, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11130, "tokens_per_second_per_gpu": 11097.12, "total_tokens": 1099048054 }, { "epoch": 0.6958614653663416, "grad_norm": 0.8931823372840881, "learning_rate": 2e-05, "loss": 0.5966, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11131, "tokens_per_second_per_gpu": 10790.87, "total_tokens": 1099145955 }, { "epoch": 0.6959239809952488, "grad_norm": 0.8770608305931091, "learning_rate": 2e-05, "loss": 0.6268, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11132, "tokens_per_second_per_gpu": 10113.63, "total_tokens": 1099246230 }, { "epoch": 0.695986496624156, "grad_norm": 0.8951683044433594, "learning_rate": 2e-05, "loss": 0.6716, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11133, "tokens_per_second_per_gpu": 10263.5, "total_tokens": 1099342560 }, { "epoch": 0.6960490122530633, "grad_norm": 0.8948382139205933, "learning_rate": 2e-05, "loss": 0.612, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11134, "tokens_per_second_per_gpu": 10129.77, "total_tokens": 1099442291 }, { "epoch": 0.6961115278819705, "grad_norm": 0.8965258002281189, "learning_rate": 2e-05, "loss": 0.6466, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11135, "tokens_per_second_per_gpu": 11090.43, "total_tokens": 1099546152 }, { "epoch": 0.6961740435108777, "grad_norm": 0.934386670589447, "learning_rate": 2e-05, "loss": 0.6097, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11136, "tokens_per_second_per_gpu": 9572.48, "total_tokens": 1099643830 }, { "epoch": 0.696236559139785, "grad_norm": 0.8971810340881348, "learning_rate": 2e-05, "loss": 0.6443, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11137, "tokens_per_second_per_gpu": 11092.61, "total_tokens": 1099743490 }, { "epoch": 0.6962990747686921, "grad_norm": 0.8583086729049683, "learning_rate": 2e-05, "loss": 0.6091, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11138, "tokens_per_second_per_gpu": 11603.61, "total_tokens": 1099844851 }, { "epoch": 0.6963615903975994, "grad_norm": 0.8993425369262695, "learning_rate": 2e-05, "loss": 0.6136, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11139, "tokens_per_second_per_gpu": 10347.77, "total_tokens": 1099941749 }, { "epoch": 0.6964241060265066, "grad_norm": 0.8830394148826599, "learning_rate": 2e-05, "loss": 0.5894, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11140, "tokens_per_second_per_gpu": 10731.68, "total_tokens": 1100040631 }, { "epoch": 0.6964866216554139, "grad_norm": 0.843674898147583, "learning_rate": 2e-05, "loss": 0.6157, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11141, "tokens_per_second_per_gpu": 10856.06, "total_tokens": 1100144144 }, { "epoch": 0.6965491372843211, "grad_norm": 0.9080104827880859, "learning_rate": 2e-05, "loss": 0.6602, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11142, "tokens_per_second_per_gpu": 10907.96, "total_tokens": 1100244305 }, { "epoch": 0.6966116529132284, "grad_norm": 0.8954107761383057, "learning_rate": 2e-05, "loss": 0.6397, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11143, "tokens_per_second_per_gpu": 10786.92, "total_tokens": 1100345837 }, { "epoch": 0.6966741685421355, "grad_norm": 0.86603182554245, "learning_rate": 2e-05, "loss": 0.6113, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11144, "tokens_per_second_per_gpu": 11005.21, "total_tokens": 1100446859 }, { "epoch": 0.6967366841710427, "grad_norm": 0.8918524980545044, "learning_rate": 2e-05, "loss": 0.6212, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11145, "tokens_per_second_per_gpu": 10842.16, "total_tokens": 1100546429 }, { "epoch": 0.69679919979995, "grad_norm": 0.8994972109794617, "learning_rate": 2e-05, "loss": 0.6298, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11146, "tokens_per_second_per_gpu": 9868.24, "total_tokens": 1100645789 }, { "epoch": 0.6968617154288572, "grad_norm": 0.8688070178031921, "learning_rate": 2e-05, "loss": 0.6164, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11147, "tokens_per_second_per_gpu": 10566.85, "total_tokens": 1100747036 }, { "epoch": 0.6969242310577645, "grad_norm": 0.9321884512901306, "learning_rate": 2e-05, "loss": 0.6823, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11148, "tokens_per_second_per_gpu": 11246.49, "total_tokens": 1100848977 }, { "epoch": 0.6969867466866717, "grad_norm": 0.896038293838501, "learning_rate": 2e-05, "loss": 0.6632, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11149, "tokens_per_second_per_gpu": 10086.13, "total_tokens": 1100949475 }, { "epoch": 0.6970492623155788, "grad_norm": 0.8604966998100281, "learning_rate": 2e-05, "loss": 0.6388, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11150, "tokens_per_second_per_gpu": 10478.31, "total_tokens": 1101051076 }, { "epoch": 0.6971117779444861, "grad_norm": 0.8922577500343323, "learning_rate": 2e-05, "loss": 0.5848, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11151, "tokens_per_second_per_gpu": 10575.27, "total_tokens": 1101149429 }, { "epoch": 0.6971742935733933, "grad_norm": 0.8412957787513733, "learning_rate": 2e-05, "loss": 0.6135, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11152, "tokens_per_second_per_gpu": 10636.14, "total_tokens": 1101248810 }, { "epoch": 0.6972368092023006, "grad_norm": 0.894646942615509, "learning_rate": 2e-05, "loss": 0.6009, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11153, "tokens_per_second_per_gpu": 9740.8, "total_tokens": 1101340838 }, { "epoch": 0.6972993248312078, "grad_norm": 0.8653839230537415, "learning_rate": 2e-05, "loss": 0.5906, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11154, "tokens_per_second_per_gpu": 9927.72, "total_tokens": 1101437029 }, { "epoch": 0.6973618404601151, "grad_norm": 0.8877037763595581, "learning_rate": 2e-05, "loss": 0.6534, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11155, "tokens_per_second_per_gpu": 10984.46, "total_tokens": 1101540513 }, { "epoch": 0.6974243560890222, "grad_norm": 0.895142674446106, "learning_rate": 2e-05, "loss": 0.6485, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11156, "tokens_per_second_per_gpu": 9762.74, "total_tokens": 1101637757 }, { "epoch": 0.6974868717179294, "grad_norm": 0.9252777695655823, "learning_rate": 2e-05, "loss": 0.6259, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11157, "tokens_per_second_per_gpu": 10107.35, "total_tokens": 1101737791 }, { "epoch": 0.6975493873468367, "grad_norm": 0.8818933963775635, "learning_rate": 2e-05, "loss": 0.6099, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11158, "tokens_per_second_per_gpu": 10406.08, "total_tokens": 1101835568 }, { "epoch": 0.6976119029757439, "grad_norm": 0.8547999858856201, "learning_rate": 2e-05, "loss": 0.6997, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11159, "tokens_per_second_per_gpu": 11492.81, "total_tokens": 1101941588 }, { "epoch": 0.6976744186046512, "grad_norm": 0.8746742010116577, "learning_rate": 2e-05, "loss": 0.6373, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11160, "tokens_per_second_per_gpu": 11005.41, "total_tokens": 1102041091 }, { "epoch": 0.6977369342335584, "grad_norm": 0.9046124815940857, "learning_rate": 2e-05, "loss": 0.6521, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11161, "tokens_per_second_per_gpu": 10336.43, "total_tokens": 1102139225 }, { "epoch": 0.6977994498624657, "grad_norm": 0.8253856301307678, "learning_rate": 2e-05, "loss": 0.6086, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11162, "tokens_per_second_per_gpu": 10345.93, "total_tokens": 1102240518 }, { "epoch": 0.6978619654913728, "grad_norm": 0.9581727981567383, "learning_rate": 2e-05, "loss": 0.6108, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11163, "tokens_per_second_per_gpu": 10315.22, "total_tokens": 1102337949 }, { "epoch": 0.69792448112028, "grad_norm": 0.8931707739830017, "learning_rate": 2e-05, "loss": 0.5929, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11164, "tokens_per_second_per_gpu": 10006.31, "total_tokens": 1102433135 }, { "epoch": 0.6979869967491873, "grad_norm": 0.8712050914764404, "learning_rate": 2e-05, "loss": 0.5868, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11165, "tokens_per_second_per_gpu": 10013.74, "total_tokens": 1102531417 }, { "epoch": 0.6980495123780945, "grad_norm": 0.8539775013923645, "learning_rate": 2e-05, "loss": 0.6234, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11166, "tokens_per_second_per_gpu": 10843.68, "total_tokens": 1102630978 }, { "epoch": 0.6981120280070018, "grad_norm": 0.8276644349098206, "learning_rate": 2e-05, "loss": 0.5619, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11167, "tokens_per_second_per_gpu": 10446.24, "total_tokens": 1102728630 }, { "epoch": 0.698174543635909, "grad_norm": 0.8766695261001587, "learning_rate": 2e-05, "loss": 0.6036, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11168, "tokens_per_second_per_gpu": 10869.89, "total_tokens": 1102828215 }, { "epoch": 0.6982370592648162, "grad_norm": 0.8467487692832947, "learning_rate": 2e-05, "loss": 0.598, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11169, "tokens_per_second_per_gpu": 10822.98, "total_tokens": 1102928535 }, { "epoch": 0.6982995748937234, "grad_norm": 0.9150669574737549, "learning_rate": 2e-05, "loss": 0.6303, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11170, "tokens_per_second_per_gpu": 10961.13, "total_tokens": 1103027483 }, { "epoch": 0.6983620905226307, "grad_norm": 0.848293662071228, "learning_rate": 2e-05, "loss": 0.6237, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11171, "tokens_per_second_per_gpu": 9977.25, "total_tokens": 1103125975 }, { "epoch": 0.6984246061515379, "grad_norm": 0.8878340721130371, "learning_rate": 2e-05, "loss": 0.6083, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11172, "tokens_per_second_per_gpu": 10345.03, "total_tokens": 1103222868 }, { "epoch": 0.6984871217804451, "grad_norm": 0.9134900569915771, "learning_rate": 2e-05, "loss": 0.6512, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11173, "tokens_per_second_per_gpu": 10519.56, "total_tokens": 1103319661 }, { "epoch": 0.6985496374093524, "grad_norm": 0.8769031763076782, "learning_rate": 2e-05, "loss": 0.5742, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11174, "tokens_per_second_per_gpu": 10321.15, "total_tokens": 1103413299 }, { "epoch": 0.6986121530382595, "grad_norm": 0.8619344830513, "learning_rate": 2e-05, "loss": 0.6397, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11175, "tokens_per_second_per_gpu": 10478.23, "total_tokens": 1103515726 }, { "epoch": 0.6986746686671668, "grad_norm": 0.8842331767082214, "learning_rate": 2e-05, "loss": 0.6122, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11176, "tokens_per_second_per_gpu": 10749.49, "total_tokens": 1103618227 }, { "epoch": 0.698737184296074, "grad_norm": 0.912373423576355, "learning_rate": 2e-05, "loss": 0.6242, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11177, "tokens_per_second_per_gpu": 11215.45, "total_tokens": 1103719760 }, { "epoch": 0.6987996999249813, "grad_norm": 0.924071192741394, "learning_rate": 2e-05, "loss": 0.6287, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11178, "tokens_per_second_per_gpu": 10694.07, "total_tokens": 1103815527 }, { "epoch": 0.6988622155538885, "grad_norm": 0.8864333629608154, "learning_rate": 2e-05, "loss": 0.5862, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11179, "tokens_per_second_per_gpu": 9811.8, "total_tokens": 1103910398 }, { "epoch": 0.6989247311827957, "grad_norm": 0.9098149538040161, "learning_rate": 2e-05, "loss": 0.6314, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11180, "tokens_per_second_per_gpu": 10620.67, "total_tokens": 1104011061 }, { "epoch": 0.6989872468117029, "grad_norm": 0.8813409805297852, "learning_rate": 2e-05, "loss": 0.6228, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11181, "tokens_per_second_per_gpu": 14159.53, "total_tokens": 1104110993 }, { "epoch": 0.6990497624406101, "grad_norm": 0.8972395658493042, "learning_rate": 2e-05, "loss": 0.6057, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11182, "tokens_per_second_per_gpu": 10740.77, "total_tokens": 1104210561 }, { "epoch": 0.6991122780695174, "grad_norm": 0.877338707447052, "learning_rate": 2e-05, "loss": 0.6777, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11183, "tokens_per_second_per_gpu": 11695.74, "total_tokens": 1104316541 }, { "epoch": 0.6991747936984246, "grad_norm": 0.9006454348564148, "learning_rate": 2e-05, "loss": 0.6255, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11184, "tokens_per_second_per_gpu": 10999.45, "total_tokens": 1104413196 }, { "epoch": 0.6992373093273319, "grad_norm": 0.8850445747375488, "learning_rate": 2e-05, "loss": 0.6363, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11185, "tokens_per_second_per_gpu": 10376.21, "total_tokens": 1104511585 }, { "epoch": 0.6992998249562391, "grad_norm": 0.8593134880065918, "learning_rate": 2e-05, "loss": 0.6261, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11186, "tokens_per_second_per_gpu": 10864.49, "total_tokens": 1104614594 }, { "epoch": 0.6993623405851462, "grad_norm": 0.9240270256996155, "learning_rate": 2e-05, "loss": 0.6828, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11187, "tokens_per_second_per_gpu": 11097.91, "total_tokens": 1104716125 }, { "epoch": 0.6994248562140535, "grad_norm": 0.851499617099762, "learning_rate": 2e-05, "loss": 0.5988, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11188, "tokens_per_second_per_gpu": 10949.54, "total_tokens": 1104817326 }, { "epoch": 0.6994873718429607, "grad_norm": 0.9520785212516785, "learning_rate": 2e-05, "loss": 0.6337, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11189, "tokens_per_second_per_gpu": 10442.87, "total_tokens": 1104915636 }, { "epoch": 0.699549887471868, "grad_norm": 0.9257386922836304, "learning_rate": 2e-05, "loss": 0.6597, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11190, "tokens_per_second_per_gpu": 9953.27, "total_tokens": 1105014437 }, { "epoch": 0.6996124031007752, "grad_norm": 0.8650333285331726, "learning_rate": 2e-05, "loss": 0.5934, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11191, "tokens_per_second_per_gpu": 9674.64, "total_tokens": 1105112591 }, { "epoch": 0.6996749187296825, "grad_norm": 0.8554652333259583, "learning_rate": 2e-05, "loss": 0.6073, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11192, "tokens_per_second_per_gpu": 10288.86, "total_tokens": 1105209980 }, { "epoch": 0.6997374343585896, "grad_norm": 0.9153255820274353, "learning_rate": 2e-05, "loss": 0.6253, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11193, "tokens_per_second_per_gpu": 10859.31, "total_tokens": 1105309900 }, { "epoch": 0.6997999499874968, "grad_norm": 0.8670977354049683, "learning_rate": 2e-05, "loss": 0.6004, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11194, "tokens_per_second_per_gpu": 10430.79, "total_tokens": 1105407851 }, { "epoch": 0.6998624656164041, "grad_norm": 0.8723688125610352, "learning_rate": 2e-05, "loss": 0.6094, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11195, "tokens_per_second_per_gpu": 10331.99, "total_tokens": 1105504998 }, { "epoch": 0.6999249812453113, "grad_norm": 0.8438317775726318, "learning_rate": 2e-05, "loss": 0.6286, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11196, "tokens_per_second_per_gpu": 11652.8, "total_tokens": 1105609237 }, { "epoch": 0.6999874968742186, "grad_norm": 0.9271538853645325, "learning_rate": 2e-05, "loss": 0.6344, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11197, "tokens_per_second_per_gpu": 10489.72, "total_tokens": 1105707401 }, { "epoch": 0.7000500125031258, "grad_norm": 0.8806109428405762, "learning_rate": 2e-05, "loss": 0.7114, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11198, "tokens_per_second_per_gpu": 11345.25, "total_tokens": 1105810307 }, { "epoch": 0.7001125281320331, "grad_norm": 0.9023105502128601, "learning_rate": 2e-05, "loss": 0.6637, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11199, "tokens_per_second_per_gpu": 10416.36, "total_tokens": 1105909449 }, { "epoch": 0.7001750437609402, "grad_norm": 0.883668065071106, "learning_rate": 2e-05, "loss": 0.654, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11200, "tokens_per_second_per_gpu": 11353.63, "total_tokens": 1106010965 }, { "epoch": 0.7002375593898474, "grad_norm": 0.9221580624580383, "learning_rate": 2e-05, "loss": 0.6021, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11201, "tokens_per_second_per_gpu": 9959.73, "total_tokens": 1106104116 }, { "epoch": 0.7003000750187547, "grad_norm": 0.8963910937309265, "learning_rate": 2e-05, "loss": 0.5918, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11202, "tokens_per_second_per_gpu": 9433.58, "total_tokens": 1106197538 }, { "epoch": 0.7003625906476619, "grad_norm": 0.9731951355934143, "learning_rate": 2e-05, "loss": 0.6366, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11203, "tokens_per_second_per_gpu": 10509.27, "total_tokens": 1106296023 }, { "epoch": 0.7004251062765692, "grad_norm": 0.8958086371421814, "learning_rate": 2e-05, "loss": 0.6547, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11204, "tokens_per_second_per_gpu": 11252.11, "total_tokens": 1106398928 }, { "epoch": 0.7004876219054764, "grad_norm": 0.8786727786064148, "learning_rate": 2e-05, "loss": 0.5996, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11205, "tokens_per_second_per_gpu": 10949.01, "total_tokens": 1106498862 }, { "epoch": 0.7005501375343836, "grad_norm": 0.8725708723068237, "learning_rate": 2e-05, "loss": 0.6033, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11206, "tokens_per_second_per_gpu": 11333.24, "total_tokens": 1106601309 }, { "epoch": 0.7006126531632908, "grad_norm": 0.9193137288093567, "learning_rate": 2e-05, "loss": 0.641, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11207, "tokens_per_second_per_gpu": 10023.23, "total_tokens": 1106694401 }, { "epoch": 0.700675168792198, "grad_norm": 0.9485359191894531, "learning_rate": 2e-05, "loss": 0.6292, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11208, "tokens_per_second_per_gpu": 11000.11, "total_tokens": 1106796534 }, { "epoch": 0.7007376844211053, "grad_norm": 0.9021811485290527, "learning_rate": 2e-05, "loss": 0.6071, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11209, "tokens_per_second_per_gpu": 10795.75, "total_tokens": 1106896656 }, { "epoch": 0.7008002000500125, "grad_norm": 0.9102077484130859, "learning_rate": 2e-05, "loss": 0.6267, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11210, "tokens_per_second_per_gpu": 10489.63, "total_tokens": 1106995134 }, { "epoch": 0.7008627156789198, "grad_norm": 0.8668289184570312, "learning_rate": 2e-05, "loss": 0.5962, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11211, "tokens_per_second_per_gpu": 10195.33, "total_tokens": 1107096683 }, { "epoch": 0.7009252313078269, "grad_norm": 0.9263255596160889, "learning_rate": 2e-05, "loss": 0.645, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11212, "tokens_per_second_per_gpu": 10690.9, "total_tokens": 1107196948 }, { "epoch": 0.7009877469367342, "grad_norm": 0.9340075254440308, "learning_rate": 2e-05, "loss": 0.632, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11213, "tokens_per_second_per_gpu": 11400.6, "total_tokens": 1107297058 }, { "epoch": 0.7010502625656414, "grad_norm": 0.9195365309715271, "learning_rate": 2e-05, "loss": 0.6107, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11214, "tokens_per_second_per_gpu": 10475.28, "total_tokens": 1107395903 }, { "epoch": 0.7011127781945486, "grad_norm": 0.8494018316268921, "learning_rate": 2e-05, "loss": 0.5905, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11215, "tokens_per_second_per_gpu": 11229.95, "total_tokens": 1107501489 }, { "epoch": 0.7011752938234559, "grad_norm": 0.8858568072319031, "learning_rate": 2e-05, "loss": 0.639, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11216, "tokens_per_second_per_gpu": 10900.58, "total_tokens": 1107603626 }, { "epoch": 0.7012378094523631, "grad_norm": 0.8801355361938477, "learning_rate": 2e-05, "loss": 0.6686, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11217, "tokens_per_second_per_gpu": 10753.63, "total_tokens": 1107705759 }, { "epoch": 0.7013003250812703, "grad_norm": 0.8742863535881042, "learning_rate": 2e-05, "loss": 0.6282, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11218, "tokens_per_second_per_gpu": 10736.65, "total_tokens": 1107806370 }, { "epoch": 0.7013628407101775, "grad_norm": 0.9484304189682007, "learning_rate": 2e-05, "loss": 0.6111, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11219, "tokens_per_second_per_gpu": 9976.75, "total_tokens": 1107900436 }, { "epoch": 0.7014253563390848, "grad_norm": 0.9576755166053772, "learning_rate": 2e-05, "loss": 0.6081, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11220, "tokens_per_second_per_gpu": 9992.39, "total_tokens": 1107997435 }, { "epoch": 0.701487871967992, "grad_norm": 0.8857638835906982, "learning_rate": 2e-05, "loss": 0.6066, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11221, "tokens_per_second_per_gpu": 10668.41, "total_tokens": 1108092939 }, { "epoch": 0.7015503875968992, "grad_norm": 0.9031539559364319, "learning_rate": 2e-05, "loss": 0.6391, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11222, "tokens_per_second_per_gpu": 11100.81, "total_tokens": 1108193494 }, { "epoch": 0.7016129032258065, "grad_norm": 0.8873246312141418, "learning_rate": 2e-05, "loss": 0.6258, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11223, "tokens_per_second_per_gpu": 9971.82, "total_tokens": 1108292070 }, { "epoch": 0.7016754188547136, "grad_norm": 0.9057042598724365, "learning_rate": 2e-05, "loss": 0.6345, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11224, "tokens_per_second_per_gpu": 10725.85, "total_tokens": 1108388939 }, { "epoch": 0.7017379344836209, "grad_norm": 0.8744062185287476, "learning_rate": 2e-05, "loss": 0.6366, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11225, "tokens_per_second_per_gpu": 10271.15, "total_tokens": 1108491865 }, { "epoch": 0.7018004501125281, "grad_norm": 0.9251624941825867, "learning_rate": 2e-05, "loss": 0.644, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11226, "tokens_per_second_per_gpu": 10558.97, "total_tokens": 1108589661 }, { "epoch": 0.7018629657414354, "grad_norm": 0.8876326084136963, "learning_rate": 2e-05, "loss": 0.6248, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11227, "tokens_per_second_per_gpu": 9952.03, "total_tokens": 1108688171 }, { "epoch": 0.7019254813703426, "grad_norm": 0.8554581999778748, "learning_rate": 2e-05, "loss": 0.6151, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11228, "tokens_per_second_per_gpu": 10902.9, "total_tokens": 1108791012 }, { "epoch": 0.7019879969992499, "grad_norm": 0.8855634331703186, "learning_rate": 2e-05, "loss": 0.6553, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11229, "tokens_per_second_per_gpu": 10830.46, "total_tokens": 1108891288 }, { "epoch": 0.702050512628157, "grad_norm": 0.9006768465042114, "learning_rate": 2e-05, "loss": 0.6677, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11230, "tokens_per_second_per_gpu": 10418.35, "total_tokens": 1108989802 }, { "epoch": 0.7021130282570642, "grad_norm": 0.9059628248214722, "learning_rate": 2e-05, "loss": 0.6121, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11231, "tokens_per_second_per_gpu": 10057.45, "total_tokens": 1109087367 }, { "epoch": 0.7021755438859715, "grad_norm": 0.8955137729644775, "learning_rate": 2e-05, "loss": 0.5878, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11232, "tokens_per_second_per_gpu": 10507.07, "total_tokens": 1109186482 }, { "epoch": 0.7022380595148787, "grad_norm": 0.9111484885215759, "learning_rate": 2e-05, "loss": 0.6298, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11233, "tokens_per_second_per_gpu": 10623.02, "total_tokens": 1109285537 }, { "epoch": 0.702300575143786, "grad_norm": 0.8926054835319519, "learning_rate": 2e-05, "loss": 0.6385, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11234, "tokens_per_second_per_gpu": 10361.34, "total_tokens": 1109384325 }, { "epoch": 0.7023630907726932, "grad_norm": 0.8647673726081848, "learning_rate": 2e-05, "loss": 0.5832, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11235, "tokens_per_second_per_gpu": 9698.48, "total_tokens": 1109479229 }, { "epoch": 0.7024256064016005, "grad_norm": 0.9140294790267944, "learning_rate": 2e-05, "loss": 0.6004, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11236, "tokens_per_second_per_gpu": 10779.07, "total_tokens": 1109579625 }, { "epoch": 0.7024881220305076, "grad_norm": 0.9127988219261169, "learning_rate": 2e-05, "loss": 0.6673, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11237, "tokens_per_second_per_gpu": 10034.74, "total_tokens": 1109680835 }, { "epoch": 0.7025506376594148, "grad_norm": 0.9711707234382629, "learning_rate": 2e-05, "loss": 0.6512, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11238, "tokens_per_second_per_gpu": 9819.15, "total_tokens": 1109774692 }, { "epoch": 0.7026131532883221, "grad_norm": 0.8809818625450134, "learning_rate": 2e-05, "loss": 0.5705, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11239, "tokens_per_second_per_gpu": 10588.07, "total_tokens": 1109869433 }, { "epoch": 0.7026756689172293, "grad_norm": 0.8638799786567688, "learning_rate": 2e-05, "loss": 0.6357, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11240, "tokens_per_second_per_gpu": 10809.07, "total_tokens": 1109968304 }, { "epoch": 0.7027381845461366, "grad_norm": 0.9202423691749573, "learning_rate": 2e-05, "loss": 0.6278, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11241, "tokens_per_second_per_gpu": 9593.06, "total_tokens": 1110061998 }, { "epoch": 0.7028007001750438, "grad_norm": 0.8906283378601074, "learning_rate": 2e-05, "loss": 0.6084, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11242, "tokens_per_second_per_gpu": 9386.34, "total_tokens": 1110158446 }, { "epoch": 0.702863215803951, "grad_norm": 0.8766936659812927, "learning_rate": 2e-05, "loss": 0.608, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11243, "tokens_per_second_per_gpu": 9967.14, "total_tokens": 1110254512 }, { "epoch": 0.7029257314328582, "grad_norm": 0.863767683506012, "learning_rate": 2e-05, "loss": 0.5933, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11244, "tokens_per_second_per_gpu": 10593.94, "total_tokens": 1110350979 }, { "epoch": 0.7029882470617654, "grad_norm": 0.8705213665962219, "learning_rate": 2e-05, "loss": 0.6601, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11245, "tokens_per_second_per_gpu": 10638.89, "total_tokens": 1110451709 }, { "epoch": 0.7030507626906727, "grad_norm": 0.9010369777679443, "learning_rate": 2e-05, "loss": 0.6039, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11246, "tokens_per_second_per_gpu": 10620.42, "total_tokens": 1110549697 }, { "epoch": 0.7031132783195799, "grad_norm": 0.9240472912788391, "learning_rate": 2e-05, "loss": 0.6277, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11247, "tokens_per_second_per_gpu": 9966.8, "total_tokens": 1110646916 }, { "epoch": 0.7031757939484872, "grad_norm": 0.9058580994606018, "learning_rate": 2e-05, "loss": 0.6381, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11248, "tokens_per_second_per_gpu": 9722.85, "total_tokens": 1110743148 }, { "epoch": 0.7032383095773943, "grad_norm": 0.8700737953186035, "learning_rate": 2e-05, "loss": 0.6076, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11249, "tokens_per_second_per_gpu": 10717.31, "total_tokens": 1110842990 }, { "epoch": 0.7033008252063015, "grad_norm": 0.935356616973877, "learning_rate": 2e-05, "loss": 0.6528, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11250, "tokens_per_second_per_gpu": 10600.03, "total_tokens": 1110942308 }, { "epoch": 0.7033633408352088, "grad_norm": 0.9049314260482788, "learning_rate": 2e-05, "loss": 0.618, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11251, "tokens_per_second_per_gpu": 9964.04, "total_tokens": 1111040871 }, { "epoch": 0.703425856464116, "grad_norm": 0.8976064920425415, "learning_rate": 2e-05, "loss": 0.6135, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11252, "tokens_per_second_per_gpu": 10799.84, "total_tokens": 1111141378 }, { "epoch": 0.7034883720930233, "grad_norm": 0.9136959314346313, "learning_rate": 2e-05, "loss": 0.6996, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11253, "tokens_per_second_per_gpu": 10521.33, "total_tokens": 1111243522 }, { "epoch": 0.7035508877219305, "grad_norm": 0.8477944135665894, "learning_rate": 2e-05, "loss": 0.6414, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11254, "tokens_per_second_per_gpu": 11071.53, "total_tokens": 1111343216 }, { "epoch": 0.7036134033508377, "grad_norm": 0.9091095328330994, "learning_rate": 2e-05, "loss": 0.6433, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11255, "tokens_per_second_per_gpu": 10142.2, "total_tokens": 1111440606 }, { "epoch": 0.7036759189797449, "grad_norm": 0.8766590356826782, "learning_rate": 2e-05, "loss": 0.6246, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11256, "tokens_per_second_per_gpu": 10166.35, "total_tokens": 1111539601 }, { "epoch": 0.7037384346086522, "grad_norm": 0.8689647912979126, "learning_rate": 2e-05, "loss": 0.6092, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11257, "tokens_per_second_per_gpu": 10546.82, "total_tokens": 1111637991 }, { "epoch": 0.7038009502375594, "grad_norm": 0.8712821006774902, "learning_rate": 2e-05, "loss": 0.6572, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11258, "tokens_per_second_per_gpu": 10299.97, "total_tokens": 1111737668 }, { "epoch": 0.7038634658664666, "grad_norm": 0.8787989020347595, "learning_rate": 2e-05, "loss": 0.6147, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11259, "tokens_per_second_per_gpu": 10528.21, "total_tokens": 1111835906 }, { "epoch": 0.7039259814953739, "grad_norm": 0.9322484135627747, "learning_rate": 2e-05, "loss": 0.5883, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11260, "tokens_per_second_per_gpu": 9557.8, "total_tokens": 1111930101 }, { "epoch": 0.703988497124281, "grad_norm": 0.9224986433982849, "learning_rate": 2e-05, "loss": 0.599, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11261, "tokens_per_second_per_gpu": 10575.27, "total_tokens": 1112029684 }, { "epoch": 0.7040510127531883, "grad_norm": 0.9731825590133667, "learning_rate": 2e-05, "loss": 0.6133, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11262, "tokens_per_second_per_gpu": 10205.23, "total_tokens": 1112126923 }, { "epoch": 0.7041135283820955, "grad_norm": 0.8861408829689026, "learning_rate": 2e-05, "loss": 0.6111, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11263, "tokens_per_second_per_gpu": 10746.63, "total_tokens": 1112228032 }, { "epoch": 0.7041760440110028, "grad_norm": 0.9054393172264099, "learning_rate": 2e-05, "loss": 0.5709, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11264, "tokens_per_second_per_gpu": 9552.57, "total_tokens": 1112323295 }, { "epoch": 0.70423855963991, "grad_norm": 0.8842883110046387, "learning_rate": 2e-05, "loss": 0.6283, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11265, "tokens_per_second_per_gpu": 9849.5, "total_tokens": 1112416273 }, { "epoch": 0.7043010752688172, "grad_norm": 1.0449472665786743, "learning_rate": 2e-05, "loss": 0.6517, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11266, "tokens_per_second_per_gpu": 10811.96, "total_tokens": 1112514593 }, { "epoch": 0.7043635908977244, "grad_norm": 0.9084378480911255, "learning_rate": 2e-05, "loss": 0.6629, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11267, "tokens_per_second_per_gpu": 10207.5, "total_tokens": 1112613532 }, { "epoch": 0.7044261065266316, "grad_norm": 0.903613269329071, "learning_rate": 2e-05, "loss": 0.6647, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11268, "tokens_per_second_per_gpu": 10858.7, "total_tokens": 1112715233 }, { "epoch": 0.7044886221555389, "grad_norm": 0.9122767448425293, "learning_rate": 2e-05, "loss": 0.6305, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11269, "tokens_per_second_per_gpu": 10190.56, "total_tokens": 1112810054 }, { "epoch": 0.7045511377844461, "grad_norm": 0.8618295192718506, "learning_rate": 2e-05, "loss": 0.6673, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11270, "tokens_per_second_per_gpu": 10687.14, "total_tokens": 1112913506 }, { "epoch": 0.7046136534133534, "grad_norm": 0.9081927537918091, "learning_rate": 2e-05, "loss": 0.6252, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11271, "tokens_per_second_per_gpu": 10067.49, "total_tokens": 1113007854 }, { "epoch": 0.7046761690422606, "grad_norm": 0.9008407592773438, "learning_rate": 2e-05, "loss": 0.6541, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11272, "tokens_per_second_per_gpu": 9837.69, "total_tokens": 1113106818 }, { "epoch": 0.7047386846711677, "grad_norm": 0.9263085722923279, "learning_rate": 2e-05, "loss": 0.6272, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11273, "tokens_per_second_per_gpu": 10986.41, "total_tokens": 1113204035 }, { "epoch": 0.704801200300075, "grad_norm": 0.9071946740150452, "learning_rate": 2e-05, "loss": 0.6078, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11274, "tokens_per_second_per_gpu": 9769.5, "total_tokens": 1113297901 }, { "epoch": 0.7048637159289822, "grad_norm": 0.8883011341094971, "learning_rate": 2e-05, "loss": 0.6535, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11275, "tokens_per_second_per_gpu": 10585.86, "total_tokens": 1113394721 }, { "epoch": 0.7049262315578895, "grad_norm": 0.93870609998703, "learning_rate": 2e-05, "loss": 0.6456, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11276, "tokens_per_second_per_gpu": 10898.33, "total_tokens": 1113494480 }, { "epoch": 0.7049887471867967, "grad_norm": 0.8977581262588501, "learning_rate": 2e-05, "loss": 0.614, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11277, "tokens_per_second_per_gpu": 10311.08, "total_tokens": 1113592511 }, { "epoch": 0.705051262815704, "grad_norm": 0.9371375441551208, "learning_rate": 2e-05, "loss": 0.6532, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11278, "tokens_per_second_per_gpu": 10860.39, "total_tokens": 1113693860 }, { "epoch": 0.7051137784446112, "grad_norm": 0.9109902381896973, "learning_rate": 2e-05, "loss": 0.6203, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11279, "tokens_per_second_per_gpu": 10083.86, "total_tokens": 1113788861 }, { "epoch": 0.7051762940735183, "grad_norm": 0.8897942900657654, "learning_rate": 2e-05, "loss": 0.6093, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11280, "tokens_per_second_per_gpu": 10395.86, "total_tokens": 1113885804 }, { "epoch": 0.7052388097024256, "grad_norm": 0.8969079256057739, "learning_rate": 2e-05, "loss": 0.6447, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11281, "tokens_per_second_per_gpu": 10691.96, "total_tokens": 1113984803 }, { "epoch": 0.7053013253313328, "grad_norm": 0.9184076189994812, "learning_rate": 2e-05, "loss": 0.6203, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11282, "tokens_per_second_per_gpu": 10850.48, "total_tokens": 1114084913 }, { "epoch": 0.7053638409602401, "grad_norm": 0.8847569227218628, "learning_rate": 2e-05, "loss": 0.6016, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11283, "tokens_per_second_per_gpu": 10943.74, "total_tokens": 1114182770 }, { "epoch": 0.7054263565891473, "grad_norm": 0.9014648795127869, "learning_rate": 2e-05, "loss": 0.6349, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11284, "tokens_per_second_per_gpu": 10802.65, "total_tokens": 1114282603 }, { "epoch": 0.7054888722180546, "grad_norm": 0.9107537269592285, "learning_rate": 2e-05, "loss": 0.6336, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11285, "tokens_per_second_per_gpu": 10454.25, "total_tokens": 1114380080 }, { "epoch": 0.7055513878469617, "grad_norm": 0.9312154054641724, "learning_rate": 2e-05, "loss": 0.641, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11286, "tokens_per_second_per_gpu": 10610.25, "total_tokens": 1114475225 }, { "epoch": 0.7056139034758689, "grad_norm": 0.8753261566162109, "learning_rate": 2e-05, "loss": 0.6335, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11287, "tokens_per_second_per_gpu": 9707.95, "total_tokens": 1114575562 }, { "epoch": 0.7056764191047762, "grad_norm": 0.9135144948959351, "learning_rate": 2e-05, "loss": 0.6613, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11288, "tokens_per_second_per_gpu": 10887.8, "total_tokens": 1114676393 }, { "epoch": 0.7057389347336834, "grad_norm": 0.8719343543052673, "learning_rate": 2e-05, "loss": 0.6357, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11289, "tokens_per_second_per_gpu": 10293.77, "total_tokens": 1114776072 }, { "epoch": 0.7058014503625907, "grad_norm": 0.9203583002090454, "learning_rate": 2e-05, "loss": 0.6513, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11290, "tokens_per_second_per_gpu": 11063.95, "total_tokens": 1114875651 }, { "epoch": 0.7058639659914979, "grad_norm": 0.8804628849029541, "learning_rate": 2e-05, "loss": 0.5906, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11291, "tokens_per_second_per_gpu": 10738.63, "total_tokens": 1114974473 }, { "epoch": 0.705926481620405, "grad_norm": 0.9030183553695679, "learning_rate": 2e-05, "loss": 0.6223, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11292, "tokens_per_second_per_gpu": 10919.44, "total_tokens": 1115073110 }, { "epoch": 0.7059889972493123, "grad_norm": 0.928688645362854, "learning_rate": 2e-05, "loss": 0.6239, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11293, "tokens_per_second_per_gpu": 10547.62, "total_tokens": 1115169557 }, { "epoch": 0.7060515128782195, "grad_norm": 0.8936448693275452, "learning_rate": 2e-05, "loss": 0.632, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11294, "tokens_per_second_per_gpu": 10141.06, "total_tokens": 1115270179 }, { "epoch": 0.7061140285071268, "grad_norm": 0.8857676982879639, "learning_rate": 2e-05, "loss": 0.6714, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11295, "tokens_per_second_per_gpu": 10380.72, "total_tokens": 1115368904 }, { "epoch": 0.706176544136034, "grad_norm": 0.9000554084777832, "learning_rate": 2e-05, "loss": 0.6201, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11296, "tokens_per_second_per_gpu": 10454.67, "total_tokens": 1115465563 }, { "epoch": 0.7062390597649413, "grad_norm": 0.8961736559867859, "learning_rate": 2e-05, "loss": 0.6081, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11297, "tokens_per_second_per_gpu": 11040.13, "total_tokens": 1115566212 }, { "epoch": 0.7063015753938484, "grad_norm": 0.8896161913871765, "learning_rate": 2e-05, "loss": 0.5834, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11298, "tokens_per_second_per_gpu": 10188.19, "total_tokens": 1115660547 }, { "epoch": 0.7063640910227557, "grad_norm": 0.8924298286437988, "learning_rate": 2e-05, "loss": 0.6162, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11299, "tokens_per_second_per_gpu": 10708.13, "total_tokens": 1115755431 }, { "epoch": 0.7064266066516629, "grad_norm": 0.8876916766166687, "learning_rate": 2e-05, "loss": 0.672, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11300, "tokens_per_second_per_gpu": 9978.77, "total_tokens": 1115854846 }, { "epoch": 0.7064891222805701, "grad_norm": 0.9014532566070557, "learning_rate": 2e-05, "loss": 0.6458, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11301, "tokens_per_second_per_gpu": 10132.64, "total_tokens": 1115950871 }, { "epoch": 0.7065516379094774, "grad_norm": 0.891464352607727, "learning_rate": 2e-05, "loss": 0.6543, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11302, "tokens_per_second_per_gpu": 10948.46, "total_tokens": 1116048599 }, { "epoch": 0.7066141535383846, "grad_norm": 0.8752882480621338, "learning_rate": 2e-05, "loss": 0.628, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11303, "tokens_per_second_per_gpu": 10694.81, "total_tokens": 1116147270 }, { "epoch": 0.7066766691672918, "grad_norm": 0.8432217240333557, "learning_rate": 2e-05, "loss": 0.6392, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11304, "tokens_per_second_per_gpu": 10551.51, "total_tokens": 1116249073 }, { "epoch": 0.706739184796199, "grad_norm": 0.9014232754707336, "learning_rate": 2e-05, "loss": 0.616, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11305, "tokens_per_second_per_gpu": 10486.37, "total_tokens": 1116346049 }, { "epoch": 0.7068017004251063, "grad_norm": 0.8951859474182129, "learning_rate": 2e-05, "loss": 0.6373, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11306, "tokens_per_second_per_gpu": 10559.0, "total_tokens": 1116445254 }, { "epoch": 0.7068642160540135, "grad_norm": 0.8559538722038269, "learning_rate": 2e-05, "loss": 0.611, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11307, "tokens_per_second_per_gpu": 10243.03, "total_tokens": 1116545295 }, { "epoch": 0.7069267316829208, "grad_norm": 0.8633821606636047, "learning_rate": 2e-05, "loss": 0.6067, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11308, "tokens_per_second_per_gpu": 10732.34, "total_tokens": 1116644747 }, { "epoch": 0.706989247311828, "grad_norm": 0.8628137111663818, "learning_rate": 2e-05, "loss": 0.6298, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11309, "tokens_per_second_per_gpu": 11019.7, "total_tokens": 1116747327 }, { "epoch": 0.7070517629407351, "grad_norm": 0.8884209990501404, "learning_rate": 2e-05, "loss": 0.5958, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11310, "tokens_per_second_per_gpu": 10055.68, "total_tokens": 1116841467 }, { "epoch": 0.7071142785696424, "grad_norm": 0.8838372826576233, "learning_rate": 2e-05, "loss": 0.6192, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11311, "tokens_per_second_per_gpu": 10522.7, "total_tokens": 1116940488 }, { "epoch": 0.7071767941985496, "grad_norm": 0.9056261777877808, "learning_rate": 2e-05, "loss": 0.5983, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11312, "tokens_per_second_per_gpu": 10313.57, "total_tokens": 1117034657 }, { "epoch": 0.7072393098274569, "grad_norm": 0.9208064675331116, "learning_rate": 2e-05, "loss": 0.6491, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11313, "tokens_per_second_per_gpu": 10184.34, "total_tokens": 1117129907 }, { "epoch": 0.7073018254563641, "grad_norm": 0.9067234396934509, "learning_rate": 2e-05, "loss": 0.6555, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11314, "tokens_per_second_per_gpu": 10259.96, "total_tokens": 1117228762 }, { "epoch": 0.7073643410852714, "grad_norm": 0.8882044553756714, "learning_rate": 2e-05, "loss": 0.663, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11315, "tokens_per_second_per_gpu": 10774.45, "total_tokens": 1117327372 }, { "epoch": 0.7074268567141786, "grad_norm": 0.8925564885139465, "learning_rate": 2e-05, "loss": 0.6059, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11316, "tokens_per_second_per_gpu": 10492.28, "total_tokens": 1117422724 }, { "epoch": 0.7074893723430857, "grad_norm": 0.8778972625732422, "learning_rate": 2e-05, "loss": 0.6138, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11317, "tokens_per_second_per_gpu": 10274.55, "total_tokens": 1117523226 }, { "epoch": 0.707551887971993, "grad_norm": 0.9050849080085754, "learning_rate": 2e-05, "loss": 0.6588, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11318, "tokens_per_second_per_gpu": 10374.34, "total_tokens": 1117621462 }, { "epoch": 0.7076144036009002, "grad_norm": 0.8658629059791565, "learning_rate": 2e-05, "loss": 0.6119, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11319, "tokens_per_second_per_gpu": 10736.26, "total_tokens": 1117721979 }, { "epoch": 0.7076769192298075, "grad_norm": 0.8706426024436951, "learning_rate": 2e-05, "loss": 0.6298, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11320, "tokens_per_second_per_gpu": 11236.74, "total_tokens": 1117824029 }, { "epoch": 0.7077394348587147, "grad_norm": 0.8513400554656982, "learning_rate": 2e-05, "loss": 0.5827, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11321, "tokens_per_second_per_gpu": 10617.78, "total_tokens": 1117921378 }, { "epoch": 0.707801950487622, "grad_norm": 0.896664023399353, "learning_rate": 2e-05, "loss": 0.6366, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11322, "tokens_per_second_per_gpu": 10764.29, "total_tokens": 1118019128 }, { "epoch": 0.7078644661165291, "grad_norm": 0.8767240047454834, "learning_rate": 2e-05, "loss": 0.5976, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11323, "tokens_per_second_per_gpu": 10417.56, "total_tokens": 1118116755 }, { "epoch": 0.7079269817454363, "grad_norm": 0.8918823003768921, "learning_rate": 2e-05, "loss": 0.6031, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11324, "tokens_per_second_per_gpu": 10220.86, "total_tokens": 1118210032 }, { "epoch": 0.7079894973743436, "grad_norm": 0.8531631231307983, "learning_rate": 2e-05, "loss": 0.5947, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11325, "tokens_per_second_per_gpu": 11066.0, "total_tokens": 1118309661 }, { "epoch": 0.7080520130032508, "grad_norm": 0.8974096179008484, "learning_rate": 2e-05, "loss": 0.6309, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11326, "tokens_per_second_per_gpu": 10757.74, "total_tokens": 1118408318 }, { "epoch": 0.7081145286321581, "grad_norm": 0.8863608837127686, "learning_rate": 2e-05, "loss": 0.6515, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11327, "tokens_per_second_per_gpu": 11735.06, "total_tokens": 1118511282 }, { "epoch": 0.7081770442610653, "grad_norm": 0.8787063360214233, "learning_rate": 2e-05, "loss": 0.6514, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11328, "tokens_per_second_per_gpu": 11222.49, "total_tokens": 1118611781 }, { "epoch": 0.7082395598899724, "grad_norm": 0.8855072855949402, "learning_rate": 2e-05, "loss": 0.6099, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11329, "tokens_per_second_per_gpu": 9070.93, "total_tokens": 1118705629 }, { "epoch": 0.7083020755188797, "grad_norm": 0.8569707274436951, "learning_rate": 2e-05, "loss": 0.6408, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11330, "tokens_per_second_per_gpu": 10010.25, "total_tokens": 1118808233 }, { "epoch": 0.7083645911477869, "grad_norm": 0.8850457072257996, "learning_rate": 2e-05, "loss": 0.6478, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11331, "tokens_per_second_per_gpu": 11214.64, "total_tokens": 1118910950 }, { "epoch": 0.7084271067766942, "grad_norm": 0.9457775950431824, "learning_rate": 2e-05, "loss": 0.6176, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11332, "tokens_per_second_per_gpu": 11089.62, "total_tokens": 1119009445 }, { "epoch": 0.7084896224056014, "grad_norm": 0.8541898131370544, "learning_rate": 2e-05, "loss": 0.599, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11333, "tokens_per_second_per_gpu": 11017.09, "total_tokens": 1119111220 }, { "epoch": 0.7085521380345087, "grad_norm": 0.8711912631988525, "learning_rate": 2e-05, "loss": 0.6304, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11334, "tokens_per_second_per_gpu": 10407.54, "total_tokens": 1119208498 }, { "epoch": 0.7086146536634158, "grad_norm": 0.8806432485580444, "learning_rate": 2e-05, "loss": 0.6172, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11335, "tokens_per_second_per_gpu": 10644.45, "total_tokens": 1119308140 }, { "epoch": 0.708677169292323, "grad_norm": 0.8670042157173157, "learning_rate": 2e-05, "loss": 0.6082, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11336, "tokens_per_second_per_gpu": 10307.68, "total_tokens": 1119406651 }, { "epoch": 0.7087396849212303, "grad_norm": 0.8825917840003967, "learning_rate": 2e-05, "loss": 0.6033, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11337, "tokens_per_second_per_gpu": 9415.39, "total_tokens": 1119501399 }, { "epoch": 0.7088022005501375, "grad_norm": 0.8953518271446228, "learning_rate": 2e-05, "loss": 0.6046, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11338, "tokens_per_second_per_gpu": 9742.28, "total_tokens": 1119599131 }, { "epoch": 0.7088647161790448, "grad_norm": 0.8916895389556885, "learning_rate": 2e-05, "loss": 0.6345, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11339, "tokens_per_second_per_gpu": 9978.32, "total_tokens": 1119697516 }, { "epoch": 0.708927231807952, "grad_norm": 0.8584312796592712, "learning_rate": 2e-05, "loss": 0.6139, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11340, "tokens_per_second_per_gpu": 10933.59, "total_tokens": 1119796620 }, { "epoch": 0.7089897474368592, "grad_norm": 0.8699285387992859, "learning_rate": 2e-05, "loss": 0.5804, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11341, "tokens_per_second_per_gpu": 9993.49, "total_tokens": 1119891211 }, { "epoch": 0.7090522630657664, "grad_norm": 0.9005908370018005, "learning_rate": 2e-05, "loss": 0.6469, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11342, "tokens_per_second_per_gpu": 10313.52, "total_tokens": 1119990968 }, { "epoch": 0.7091147786946737, "grad_norm": 0.875180721282959, "learning_rate": 2e-05, "loss": 0.6171, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11343, "tokens_per_second_per_gpu": 9938.93, "total_tokens": 1120086243 }, { "epoch": 0.7091772943235809, "grad_norm": 1.1359773874282837, "learning_rate": 2e-05, "loss": 0.6064, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11344, "tokens_per_second_per_gpu": 9574.51, "total_tokens": 1120181309 }, { "epoch": 0.7092398099524881, "grad_norm": 0.9314468502998352, "learning_rate": 2e-05, "loss": 0.6091, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11345, "tokens_per_second_per_gpu": 9064.14, "total_tokens": 1120273689 }, { "epoch": 0.7093023255813954, "grad_norm": 0.9109925627708435, "learning_rate": 2e-05, "loss": 0.6122, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11346, "tokens_per_second_per_gpu": 10420.93, "total_tokens": 1120370254 }, { "epoch": 0.7093648412103025, "grad_norm": 0.9053341150283813, "learning_rate": 2e-05, "loss": 0.6124, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11347, "tokens_per_second_per_gpu": 10485.98, "total_tokens": 1120466727 }, { "epoch": 0.7094273568392098, "grad_norm": 0.9444143772125244, "learning_rate": 2e-05, "loss": 0.6407, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11348, "tokens_per_second_per_gpu": 10241.21, "total_tokens": 1120562648 }, { "epoch": 0.709489872468117, "grad_norm": 0.906632661819458, "learning_rate": 2e-05, "loss": 0.6371, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11349, "tokens_per_second_per_gpu": 9964.73, "total_tokens": 1120658667 }, { "epoch": 0.7095523880970243, "grad_norm": 0.8710077404975891, "learning_rate": 2e-05, "loss": 0.5858, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11350, "tokens_per_second_per_gpu": 11088.34, "total_tokens": 1120758192 }, { "epoch": 0.7096149037259315, "grad_norm": 0.90675288438797, "learning_rate": 2e-05, "loss": 0.6596, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11351, "tokens_per_second_per_gpu": 11002.69, "total_tokens": 1120860861 }, { "epoch": 0.7096774193548387, "grad_norm": 0.9381788372993469, "learning_rate": 2e-05, "loss": 0.6272, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11352, "tokens_per_second_per_gpu": 10485.14, "total_tokens": 1120957041 }, { "epoch": 0.709739934983746, "grad_norm": 0.8643229007720947, "learning_rate": 2e-05, "loss": 0.6778, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11353, "tokens_per_second_per_gpu": 11076.57, "total_tokens": 1121061229 }, { "epoch": 0.7098024506126531, "grad_norm": 0.9081532955169678, "learning_rate": 2e-05, "loss": 0.6892, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11354, "tokens_per_second_per_gpu": 9787.33, "total_tokens": 1121160394 }, { "epoch": 0.7098649662415604, "grad_norm": 0.9160919189453125, "learning_rate": 2e-05, "loss": 0.6171, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11355, "tokens_per_second_per_gpu": 10527.81, "total_tokens": 1121256330 }, { "epoch": 0.7099274818704676, "grad_norm": 0.8755993247032166, "learning_rate": 2e-05, "loss": 0.6068, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11356, "tokens_per_second_per_gpu": 10153.73, "total_tokens": 1121352597 }, { "epoch": 0.7099899974993749, "grad_norm": 0.8964266777038574, "learning_rate": 2e-05, "loss": 0.652, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11357, "tokens_per_second_per_gpu": 9893.22, "total_tokens": 1121449320 }, { "epoch": 0.7100525131282821, "grad_norm": 0.9053621888160706, "learning_rate": 2e-05, "loss": 0.6172, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11358, "tokens_per_second_per_gpu": 10642.88, "total_tokens": 1121547656 }, { "epoch": 0.7101150287571893, "grad_norm": 0.924961507320404, "learning_rate": 2e-05, "loss": 0.6507, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11359, "tokens_per_second_per_gpu": 10884.69, "total_tokens": 1121647222 }, { "epoch": 0.7101775443860965, "grad_norm": 0.8834236264228821, "learning_rate": 2e-05, "loss": 0.6199, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11360, "tokens_per_second_per_gpu": 9535.8, "total_tokens": 1121742590 }, { "epoch": 0.7102400600150037, "grad_norm": 0.8815512657165527, "learning_rate": 2e-05, "loss": 0.6421, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11361, "tokens_per_second_per_gpu": 9900.89, "total_tokens": 1121840054 }, { "epoch": 0.710302575643911, "grad_norm": 0.8865101337432861, "learning_rate": 2e-05, "loss": 0.6454, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11362, "tokens_per_second_per_gpu": 10808.34, "total_tokens": 1121944526 }, { "epoch": 0.7103650912728182, "grad_norm": 0.9566531181335449, "learning_rate": 2e-05, "loss": 0.6415, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11363, "tokens_per_second_per_gpu": 10983.68, "total_tokens": 1122045505 }, { "epoch": 0.7104276069017255, "grad_norm": 0.9172110557556152, "learning_rate": 2e-05, "loss": 0.6551, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11364, "tokens_per_second_per_gpu": 11135.12, "total_tokens": 1122143443 }, { "epoch": 0.7104901225306327, "grad_norm": 0.8862615823745728, "learning_rate": 2e-05, "loss": 0.6076, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11365, "tokens_per_second_per_gpu": 9640.54, "total_tokens": 1122237291 }, { "epoch": 0.7105526381595398, "grad_norm": 0.8756667971611023, "learning_rate": 2e-05, "loss": 0.6116, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11366, "tokens_per_second_per_gpu": 10314.63, "total_tokens": 1122337984 }, { "epoch": 0.7106151537884471, "grad_norm": 0.9064557552337646, "learning_rate": 2e-05, "loss": 0.6068, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11367, "tokens_per_second_per_gpu": 10080.15, "total_tokens": 1122435951 }, { "epoch": 0.7106776694173543, "grad_norm": 0.9427421689033508, "learning_rate": 2e-05, "loss": 0.5907, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11368, "tokens_per_second_per_gpu": 9681.83, "total_tokens": 1122527972 }, { "epoch": 0.7107401850462616, "grad_norm": 0.9071219563484192, "learning_rate": 2e-05, "loss": 0.6597, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11369, "tokens_per_second_per_gpu": 11264.64, "total_tokens": 1122629775 }, { "epoch": 0.7108027006751688, "grad_norm": 0.9042165279388428, "learning_rate": 2e-05, "loss": 0.61, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11370, "tokens_per_second_per_gpu": 11069.31, "total_tokens": 1122728455 }, { "epoch": 0.7108652163040761, "grad_norm": 0.8549469113349915, "learning_rate": 2e-05, "loss": 0.5935, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11371, "tokens_per_second_per_gpu": 10293.77, "total_tokens": 1122821576 }, { "epoch": 0.7109277319329832, "grad_norm": 0.9058767557144165, "learning_rate": 2e-05, "loss": 0.6226, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11372, "tokens_per_second_per_gpu": 10668.61, "total_tokens": 1122920637 }, { "epoch": 0.7109902475618904, "grad_norm": 0.8824875354766846, "learning_rate": 2e-05, "loss": 0.6159, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11373, "tokens_per_second_per_gpu": 10244.71, "total_tokens": 1123019638 }, { "epoch": 0.7110527631907977, "grad_norm": 0.9169154763221741, "learning_rate": 2e-05, "loss": 0.6304, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11374, "tokens_per_second_per_gpu": 9530.99, "total_tokens": 1123112365 }, { "epoch": 0.7111152788197049, "grad_norm": 0.8916794061660767, "learning_rate": 2e-05, "loss": 0.6278, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11375, "tokens_per_second_per_gpu": 10479.94, "total_tokens": 1123211454 }, { "epoch": 0.7111777944486122, "grad_norm": 0.8714593648910522, "learning_rate": 2e-05, "loss": 0.6273, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11376, "tokens_per_second_per_gpu": 10771.11, "total_tokens": 1123309155 }, { "epoch": 0.7112403100775194, "grad_norm": 0.87166428565979, "learning_rate": 2e-05, "loss": 0.6239, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11377, "tokens_per_second_per_gpu": 10363.94, "total_tokens": 1123407553 }, { "epoch": 0.7113028257064266, "grad_norm": 0.904425859451294, "learning_rate": 2e-05, "loss": 0.627, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11378, "tokens_per_second_per_gpu": 10785.21, "total_tokens": 1123506413 }, { "epoch": 0.7113653413353338, "grad_norm": 0.8826570510864258, "learning_rate": 2e-05, "loss": 0.6464, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11379, "tokens_per_second_per_gpu": 10932.22, "total_tokens": 1123603664 }, { "epoch": 0.711427856964241, "grad_norm": 0.935448944568634, "learning_rate": 2e-05, "loss": 0.5972, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11380, "tokens_per_second_per_gpu": 10162.09, "total_tokens": 1123697562 }, { "epoch": 0.7114903725931483, "grad_norm": 0.9066961407661438, "learning_rate": 2e-05, "loss": 0.6751, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11381, "tokens_per_second_per_gpu": 10878.38, "total_tokens": 1123797071 }, { "epoch": 0.7115528882220555, "grad_norm": 0.9207942485809326, "learning_rate": 2e-05, "loss": 0.6482, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11382, "tokens_per_second_per_gpu": 10141.81, "total_tokens": 1123896632 }, { "epoch": 0.7116154038509628, "grad_norm": 0.931620180606842, "learning_rate": 2e-05, "loss": 0.6252, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11383, "tokens_per_second_per_gpu": 9840.84, "total_tokens": 1123992202 }, { "epoch": 0.7116779194798699, "grad_norm": 0.9219951033592224, "learning_rate": 2e-05, "loss": 0.6119, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11384, "tokens_per_second_per_gpu": 10059.89, "total_tokens": 1124088253 }, { "epoch": 0.7117404351087772, "grad_norm": 0.8985115885734558, "learning_rate": 2e-05, "loss": 0.6235, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11385, "tokens_per_second_per_gpu": 11180.04, "total_tokens": 1124184220 }, { "epoch": 0.7118029507376844, "grad_norm": 0.8873980641365051, "learning_rate": 2e-05, "loss": 0.6179, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11386, "tokens_per_second_per_gpu": 10511.88, "total_tokens": 1124281181 }, { "epoch": 0.7118654663665916, "grad_norm": 0.8775604367256165, "learning_rate": 2e-05, "loss": 0.6274, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11387, "tokens_per_second_per_gpu": 10400.87, "total_tokens": 1124381006 }, { "epoch": 0.7119279819954989, "grad_norm": 0.9086427688598633, "learning_rate": 2e-05, "loss": 0.6506, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11388, "tokens_per_second_per_gpu": 9657.9, "total_tokens": 1124476279 }, { "epoch": 0.7119904976244061, "grad_norm": 0.950002133846283, "learning_rate": 2e-05, "loss": 0.6186, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11389, "tokens_per_second_per_gpu": 9657.39, "total_tokens": 1124567157 }, { "epoch": 0.7120530132533134, "grad_norm": 0.862183153629303, "learning_rate": 2e-05, "loss": 0.5868, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11390, "tokens_per_second_per_gpu": 9389.04, "total_tokens": 1124657680 }, { "epoch": 0.7121155288822205, "grad_norm": 0.93153977394104, "learning_rate": 2e-05, "loss": 0.6109, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11391, "tokens_per_second_per_gpu": 9425.98, "total_tokens": 1124748345 }, { "epoch": 0.7121780445111278, "grad_norm": 0.9070084095001221, "learning_rate": 2e-05, "loss": 0.615, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11392, "tokens_per_second_per_gpu": 10876.37, "total_tokens": 1124844987 }, { "epoch": 0.712240560140035, "grad_norm": 0.9214729070663452, "learning_rate": 2e-05, "loss": 0.6468, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11393, "tokens_per_second_per_gpu": 10144.66, "total_tokens": 1124939252 }, { "epoch": 0.7123030757689423, "grad_norm": 0.9011596441268921, "learning_rate": 2e-05, "loss": 0.6786, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11394, "tokens_per_second_per_gpu": 10389.25, "total_tokens": 1125038462 }, { "epoch": 0.7123655913978495, "grad_norm": 0.8834110498428345, "learning_rate": 2e-05, "loss": 0.6468, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11395, "tokens_per_second_per_gpu": 10610.58, "total_tokens": 1125139164 }, { "epoch": 0.7124281070267567, "grad_norm": 0.9214532971382141, "learning_rate": 2e-05, "loss": 0.6679, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11396, "tokens_per_second_per_gpu": 9923.44, "total_tokens": 1125235927 }, { "epoch": 0.7124906226556639, "grad_norm": 0.8881807923316956, "learning_rate": 2e-05, "loss": 0.6375, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11397, "tokens_per_second_per_gpu": 10205.38, "total_tokens": 1125337681 }, { "epoch": 0.7125531382845711, "grad_norm": 0.8587279915809631, "learning_rate": 2e-05, "loss": 0.6214, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11398, "tokens_per_second_per_gpu": 9983.93, "total_tokens": 1125437402 }, { "epoch": 0.7126156539134784, "grad_norm": 0.9099457263946533, "learning_rate": 2e-05, "loss": 0.6332, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11399, "tokens_per_second_per_gpu": 10984.88, "total_tokens": 1125536390 }, { "epoch": 0.7126781695423856, "grad_norm": 0.9544737935066223, "learning_rate": 2e-05, "loss": 0.6353, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11400, "tokens_per_second_per_gpu": 10747.12, "total_tokens": 1125631238 }, { "epoch": 0.7127406851712929, "grad_norm": 0.9096446633338928, "learning_rate": 2e-05, "loss": 0.6331, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11401, "tokens_per_second_per_gpu": 10314.41, "total_tokens": 1125728472 }, { "epoch": 0.7128032008002001, "grad_norm": 0.9341713786125183, "learning_rate": 2e-05, "loss": 0.6446, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11402, "tokens_per_second_per_gpu": 10526.63, "total_tokens": 1125824070 }, { "epoch": 0.7128657164291072, "grad_norm": 0.8895506858825684, "learning_rate": 2e-05, "loss": 0.6327, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11403, "tokens_per_second_per_gpu": 10897.12, "total_tokens": 1125923595 }, { "epoch": 0.7129282320580145, "grad_norm": 0.9021523594856262, "learning_rate": 2e-05, "loss": 0.6554, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11404, "tokens_per_second_per_gpu": 10360.29, "total_tokens": 1126022009 }, { "epoch": 0.7129907476869217, "grad_norm": 0.9044035077095032, "learning_rate": 2e-05, "loss": 0.6453, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11405, "tokens_per_second_per_gpu": 11010.15, "total_tokens": 1126121004 }, { "epoch": 0.713053263315829, "grad_norm": 0.901960015296936, "learning_rate": 2e-05, "loss": 0.624, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11406, "tokens_per_second_per_gpu": 10299.43, "total_tokens": 1126217028 }, { "epoch": 0.7131157789447362, "grad_norm": 0.8830791115760803, "learning_rate": 2e-05, "loss": 0.5972, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11407, "tokens_per_second_per_gpu": 10124.67, "total_tokens": 1126314155 }, { "epoch": 0.7131782945736435, "grad_norm": 0.8979411125183105, "learning_rate": 2e-05, "loss": 0.658, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11408, "tokens_per_second_per_gpu": 10518.98, "total_tokens": 1126416207 }, { "epoch": 0.7132408102025506, "grad_norm": 0.9089224338531494, "learning_rate": 2e-05, "loss": 0.6142, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11409, "tokens_per_second_per_gpu": 10320.45, "total_tokens": 1126512322 }, { "epoch": 0.7133033258314578, "grad_norm": 0.9223006963729858, "learning_rate": 2e-05, "loss": 0.6394, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11410, "tokens_per_second_per_gpu": 10406.99, "total_tokens": 1126607950 }, { "epoch": 0.7133658414603651, "grad_norm": 0.9194297194480896, "learning_rate": 2e-05, "loss": 0.6081, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11411, "tokens_per_second_per_gpu": 10396.11, "total_tokens": 1126700010 }, { "epoch": 0.7134283570892723, "grad_norm": 0.8878701329231262, "learning_rate": 2e-05, "loss": 0.6517, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11412, "tokens_per_second_per_gpu": 10820.0, "total_tokens": 1126800273 }, { "epoch": 0.7134908727181796, "grad_norm": 0.9639647006988525, "learning_rate": 2e-05, "loss": 0.65, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11413, "tokens_per_second_per_gpu": 10112.6, "total_tokens": 1126894875 }, { "epoch": 0.7135533883470868, "grad_norm": 0.9004095196723938, "learning_rate": 2e-05, "loss": 0.6418, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11414, "tokens_per_second_per_gpu": 9746.84, "total_tokens": 1126994352 }, { "epoch": 0.713615903975994, "grad_norm": 0.898201048374176, "learning_rate": 2e-05, "loss": 0.6032, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11415, "tokens_per_second_per_gpu": 9810.07, "total_tokens": 1127090093 }, { "epoch": 0.7136784196049012, "grad_norm": 0.9009096622467041, "learning_rate": 2e-05, "loss": 0.6742, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11416, "tokens_per_second_per_gpu": 10792.36, "total_tokens": 1127190227 }, { "epoch": 0.7137409352338084, "grad_norm": 0.9114018082618713, "learning_rate": 2e-05, "loss": 0.6638, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11417, "tokens_per_second_per_gpu": 17313.53, "total_tokens": 1127290924 }, { "epoch": 0.7138034508627157, "grad_norm": 0.9275611639022827, "learning_rate": 2e-05, "loss": 0.6208, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11418, "tokens_per_second_per_gpu": 10653.77, "total_tokens": 1127388757 }, { "epoch": 0.7138659664916229, "grad_norm": 0.9426218271255493, "learning_rate": 2e-05, "loss": 0.6114, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11419, "tokens_per_second_per_gpu": 10529.96, "total_tokens": 1127484196 }, { "epoch": 0.7139284821205302, "grad_norm": 0.8906933069229126, "learning_rate": 2e-05, "loss": 0.6252, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11420, "tokens_per_second_per_gpu": 9452.92, "total_tokens": 1127577799 }, { "epoch": 0.7139909977494373, "grad_norm": 0.9277540445327759, "learning_rate": 2e-05, "loss": 0.6297, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11421, "tokens_per_second_per_gpu": 10333.37, "total_tokens": 1127674037 }, { "epoch": 0.7140535133783446, "grad_norm": 0.9021030068397522, "learning_rate": 2e-05, "loss": 0.608, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11422, "tokens_per_second_per_gpu": 10600.01, "total_tokens": 1127772361 }, { "epoch": 0.7141160290072518, "grad_norm": 0.9237573146820068, "learning_rate": 2e-05, "loss": 0.6387, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11423, "tokens_per_second_per_gpu": 9946.56, "total_tokens": 1127870332 }, { "epoch": 0.714178544636159, "grad_norm": 0.9841738343238831, "learning_rate": 2e-05, "loss": 0.6176, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11424, "tokens_per_second_per_gpu": 9947.98, "total_tokens": 1127964984 }, { "epoch": 0.7142410602650663, "grad_norm": 0.8929020762443542, "learning_rate": 2e-05, "loss": 0.6367, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11425, "tokens_per_second_per_gpu": 10457.17, "total_tokens": 1128065275 }, { "epoch": 0.7143035758939735, "grad_norm": 0.9249874949455261, "learning_rate": 2e-05, "loss": 0.638, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11426, "tokens_per_second_per_gpu": 9537.2, "total_tokens": 1128160245 }, { "epoch": 0.7143660915228808, "grad_norm": 0.9211813807487488, "learning_rate": 2e-05, "loss": 0.6443, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11427, "tokens_per_second_per_gpu": 9601.94, "total_tokens": 1128258372 }, { "epoch": 0.7144286071517879, "grad_norm": 0.9232777953147888, "learning_rate": 2e-05, "loss": 0.6295, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11428, "tokens_per_second_per_gpu": 10164.67, "total_tokens": 1128357775 }, { "epoch": 0.7144911227806952, "grad_norm": 0.9132519960403442, "learning_rate": 2e-05, "loss": 0.5861, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11429, "tokens_per_second_per_gpu": 9328.9, "total_tokens": 1128446062 }, { "epoch": 0.7145536384096024, "grad_norm": 0.8916193246841431, "learning_rate": 2e-05, "loss": 0.6015, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11430, "tokens_per_second_per_gpu": 10385.89, "total_tokens": 1128545840 }, { "epoch": 0.7146161540385096, "grad_norm": 0.8848522305488586, "learning_rate": 2e-05, "loss": 0.6042, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11431, "tokens_per_second_per_gpu": 10851.58, "total_tokens": 1128645918 }, { "epoch": 0.7146786696674169, "grad_norm": 0.8476694822311401, "learning_rate": 2e-05, "loss": 0.6087, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11432, "tokens_per_second_per_gpu": 10939.61, "total_tokens": 1128749043 }, { "epoch": 0.7147411852963241, "grad_norm": 0.8591250777244568, "learning_rate": 2e-05, "loss": 0.646, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11433, "tokens_per_second_per_gpu": 10680.36, "total_tokens": 1128847403 }, { "epoch": 0.7148037009252313, "grad_norm": 0.8935761451721191, "learning_rate": 2e-05, "loss": 0.6335, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11434, "tokens_per_second_per_gpu": 10861.86, "total_tokens": 1128947102 }, { "epoch": 0.7148662165541385, "grad_norm": 0.897510290145874, "learning_rate": 2e-05, "loss": 0.6207, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11435, "tokens_per_second_per_gpu": 10783.36, "total_tokens": 1129048829 }, { "epoch": 0.7149287321830458, "grad_norm": 0.9194402098655701, "learning_rate": 2e-05, "loss": 0.5989, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11436, "tokens_per_second_per_gpu": 10394.95, "total_tokens": 1129146060 }, { "epoch": 0.714991247811953, "grad_norm": 0.9799702763557434, "learning_rate": 2e-05, "loss": 0.5871, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11437, "tokens_per_second_per_gpu": 10545.56, "total_tokens": 1129245415 }, { "epoch": 0.7150537634408602, "grad_norm": 0.9133767485618591, "learning_rate": 2e-05, "loss": 0.5897, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11438, "tokens_per_second_per_gpu": 10291.2, "total_tokens": 1129340296 }, { "epoch": 0.7151162790697675, "grad_norm": 0.8998480439186096, "learning_rate": 2e-05, "loss": 0.6274, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11439, "tokens_per_second_per_gpu": 10088.38, "total_tokens": 1129439110 }, { "epoch": 0.7151787946986746, "grad_norm": 0.956751823425293, "learning_rate": 2e-05, "loss": 0.6348, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11440, "tokens_per_second_per_gpu": 10097.57, "total_tokens": 1129534945 }, { "epoch": 0.7152413103275819, "grad_norm": 0.9618973135948181, "learning_rate": 2e-05, "loss": 0.6241, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11441, "tokens_per_second_per_gpu": 11267.21, "total_tokens": 1129636252 }, { "epoch": 0.7153038259564891, "grad_norm": 0.9598008394241333, "learning_rate": 2e-05, "loss": 0.6198, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11442, "tokens_per_second_per_gpu": 9651.46, "total_tokens": 1129732583 }, { "epoch": 0.7153663415853964, "grad_norm": 0.9126755595207214, "learning_rate": 2e-05, "loss": 0.6217, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11443, "tokens_per_second_per_gpu": 10187.48, "total_tokens": 1129832996 }, { "epoch": 0.7154288572143036, "grad_norm": 0.9312610030174255, "learning_rate": 2e-05, "loss": 0.6136, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11444, "tokens_per_second_per_gpu": 10027.1, "total_tokens": 1129928550 }, { "epoch": 0.7154913728432108, "grad_norm": 0.8989867568016052, "learning_rate": 2e-05, "loss": 0.5996, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11445, "tokens_per_second_per_gpu": 10817.54, "total_tokens": 1130026624 }, { "epoch": 0.715553888472118, "grad_norm": 0.8800402879714966, "learning_rate": 2e-05, "loss": 0.5967, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11446, "tokens_per_second_per_gpu": 10439.14, "total_tokens": 1130126105 }, { "epoch": 0.7156164041010252, "grad_norm": 0.898642897605896, "learning_rate": 2e-05, "loss": 0.6471, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11447, "tokens_per_second_per_gpu": 11585.24, "total_tokens": 1130231859 }, { "epoch": 0.7156789197299325, "grad_norm": 0.974114179611206, "learning_rate": 2e-05, "loss": 0.6061, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11448, "tokens_per_second_per_gpu": 10535.93, "total_tokens": 1130330032 }, { "epoch": 0.7157414353588397, "grad_norm": 0.9135082960128784, "learning_rate": 2e-05, "loss": 0.644, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11449, "tokens_per_second_per_gpu": 9923.63, "total_tokens": 1130424290 }, { "epoch": 0.715803950987747, "grad_norm": 0.8803011775016785, "learning_rate": 2e-05, "loss": 0.6221, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11450, "tokens_per_second_per_gpu": 11005.38, "total_tokens": 1130522666 }, { "epoch": 0.7158664666166542, "grad_norm": 0.8972539305686951, "learning_rate": 2e-05, "loss": 0.6175, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11451, "tokens_per_second_per_gpu": 9293.93, "total_tokens": 1130618263 }, { "epoch": 0.7159289822455613, "grad_norm": 0.8832900524139404, "learning_rate": 2e-05, "loss": 0.632, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11452, "tokens_per_second_per_gpu": 10646.7, "total_tokens": 1130717617 }, { "epoch": 0.7159914978744686, "grad_norm": 0.9388825297355652, "learning_rate": 2e-05, "loss": 0.6515, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11453, "tokens_per_second_per_gpu": 11251.61, "total_tokens": 1130819853 }, { "epoch": 0.7160540135033758, "grad_norm": 0.9354299902915955, "learning_rate": 2e-05, "loss": 0.6031, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11454, "tokens_per_second_per_gpu": 9980.5, "total_tokens": 1130914559 }, { "epoch": 0.7161165291322831, "grad_norm": 0.9180507659912109, "learning_rate": 2e-05, "loss": 0.5938, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11455, "tokens_per_second_per_gpu": 9853.5, "total_tokens": 1131011766 }, { "epoch": 0.7161790447611903, "grad_norm": 0.9427361488342285, "learning_rate": 2e-05, "loss": 0.5818, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11456, "tokens_per_second_per_gpu": 9770.77, "total_tokens": 1131102663 }, { "epoch": 0.7162415603900976, "grad_norm": 0.8436315059661865, "learning_rate": 2e-05, "loss": 0.6371, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11457, "tokens_per_second_per_gpu": 10783.97, "total_tokens": 1131204210 }, { "epoch": 0.7163040760190047, "grad_norm": 0.8712595701217651, "learning_rate": 2e-05, "loss": 0.6116, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11458, "tokens_per_second_per_gpu": 11007.19, "total_tokens": 1131306189 }, { "epoch": 0.7163665916479119, "grad_norm": 0.9181686639785767, "learning_rate": 2e-05, "loss": 0.6093, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11459, "tokens_per_second_per_gpu": 10263.01, "total_tokens": 1131406985 }, { "epoch": 0.7164291072768192, "grad_norm": 0.9347494840621948, "learning_rate": 2e-05, "loss": 0.6125, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11460, "tokens_per_second_per_gpu": 10912.82, "total_tokens": 1131508581 }, { "epoch": 0.7164916229057264, "grad_norm": 0.8763430714607239, "learning_rate": 2e-05, "loss": 0.5982, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11461, "tokens_per_second_per_gpu": 10675.96, "total_tokens": 1131611619 }, { "epoch": 0.7165541385346337, "grad_norm": 0.8813910484313965, "learning_rate": 2e-05, "loss": 0.6055, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11462, "tokens_per_second_per_gpu": 10508.39, "total_tokens": 1131709630 }, { "epoch": 0.7166166541635409, "grad_norm": 0.962541937828064, "learning_rate": 2e-05, "loss": 0.6203, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11463, "tokens_per_second_per_gpu": 10430.04, "total_tokens": 1131805367 }, { "epoch": 0.7166791697924482, "grad_norm": 0.8890324831008911, "learning_rate": 2e-05, "loss": 0.5823, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11464, "tokens_per_second_per_gpu": 9908.73, "total_tokens": 1131900824 }, { "epoch": 0.7167416854213553, "grad_norm": 0.9149238467216492, "learning_rate": 2e-05, "loss": 0.6015, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11465, "tokens_per_second_per_gpu": 10014.26, "total_tokens": 1131993204 }, { "epoch": 0.7168042010502625, "grad_norm": 0.8895484805107117, "learning_rate": 2e-05, "loss": 0.6396, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11466, "tokens_per_second_per_gpu": 10361.74, "total_tokens": 1132094468 }, { "epoch": 0.7168667166791698, "grad_norm": 0.9037408828735352, "learning_rate": 2e-05, "loss": 0.6141, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11467, "tokens_per_second_per_gpu": 10233.35, "total_tokens": 1132188298 }, { "epoch": 0.716929232308077, "grad_norm": 0.8911735415458679, "learning_rate": 2e-05, "loss": 0.6637, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11468, "tokens_per_second_per_gpu": 11632.94, "total_tokens": 1132292750 }, { "epoch": 0.7169917479369843, "grad_norm": 0.884797215461731, "learning_rate": 2e-05, "loss": 0.6329, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11469, "tokens_per_second_per_gpu": 10522.21, "total_tokens": 1132393151 }, { "epoch": 0.7170542635658915, "grad_norm": 0.8495851755142212, "learning_rate": 2e-05, "loss": 0.563, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11470, "tokens_per_second_per_gpu": 10621.04, "total_tokens": 1132490156 }, { "epoch": 0.7171167791947987, "grad_norm": 0.9100335240364075, "learning_rate": 2e-05, "loss": 0.6304, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11471, "tokens_per_second_per_gpu": 10513.35, "total_tokens": 1132586847 }, { "epoch": 0.7171792948237059, "grad_norm": 0.8946272134780884, "learning_rate": 2e-05, "loss": 0.6263, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11472, "tokens_per_second_per_gpu": 10513.03, "total_tokens": 1132687514 }, { "epoch": 0.7172418104526131, "grad_norm": 0.8625872135162354, "learning_rate": 2e-05, "loss": 0.6337, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11473, "tokens_per_second_per_gpu": 10128.02, "total_tokens": 1132785071 }, { "epoch": 0.7173043260815204, "grad_norm": 0.920897364616394, "learning_rate": 2e-05, "loss": 0.6342, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11474, "tokens_per_second_per_gpu": 10565.78, "total_tokens": 1132883989 }, { "epoch": 0.7173668417104276, "grad_norm": 0.8516911268234253, "learning_rate": 2e-05, "loss": 0.5975, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11475, "tokens_per_second_per_gpu": 10930.49, "total_tokens": 1132985344 }, { "epoch": 0.7174293573393349, "grad_norm": 0.8783624172210693, "learning_rate": 2e-05, "loss": 0.6374, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11476, "tokens_per_second_per_gpu": 11164.66, "total_tokens": 1133088465 }, { "epoch": 0.717491872968242, "grad_norm": 0.8990813493728638, "learning_rate": 2e-05, "loss": 0.6096, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11477, "tokens_per_second_per_gpu": 10336.74, "total_tokens": 1133184617 }, { "epoch": 0.7175543885971493, "grad_norm": 1.0608166456222534, "learning_rate": 2e-05, "loss": 0.6582, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11478, "tokens_per_second_per_gpu": 10068.21, "total_tokens": 1133283075 }, { "epoch": 0.7176169042260565, "grad_norm": 0.8722541928291321, "learning_rate": 2e-05, "loss": 0.5567, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11479, "tokens_per_second_per_gpu": 9853.36, "total_tokens": 1133377851 }, { "epoch": 0.7176794198549638, "grad_norm": 0.8784239292144775, "learning_rate": 2e-05, "loss": 0.6267, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11480, "tokens_per_second_per_gpu": 10318.06, "total_tokens": 1133475769 }, { "epoch": 0.717741935483871, "grad_norm": 0.8716219663619995, "learning_rate": 2e-05, "loss": 0.6081, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11481, "tokens_per_second_per_gpu": 10212.92, "total_tokens": 1133575174 }, { "epoch": 0.7178044511127782, "grad_norm": 1.1050001382827759, "learning_rate": 2e-05, "loss": 0.6087, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11482, "tokens_per_second_per_gpu": 11116.51, "total_tokens": 1133676777 }, { "epoch": 0.7178669667416854, "grad_norm": 0.9113957285881042, "learning_rate": 2e-05, "loss": 0.6079, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11483, "tokens_per_second_per_gpu": 9912.28, "total_tokens": 1133770081 }, { "epoch": 0.7179294823705926, "grad_norm": 0.867030918598175, "learning_rate": 2e-05, "loss": 0.5695, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11484, "tokens_per_second_per_gpu": 10159.44, "total_tokens": 1133868996 }, { "epoch": 0.7179919979994999, "grad_norm": 0.8782222270965576, "learning_rate": 2e-05, "loss": 0.6437, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11485, "tokens_per_second_per_gpu": 11001.38, "total_tokens": 1133969762 }, { "epoch": 0.7180545136284071, "grad_norm": 0.9164528250694275, "learning_rate": 2e-05, "loss": 0.6191, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11486, "tokens_per_second_per_gpu": 10249.77, "total_tokens": 1134068044 }, { "epoch": 0.7181170292573144, "grad_norm": 0.8831957578659058, "learning_rate": 2e-05, "loss": 0.6295, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11487, "tokens_per_second_per_gpu": 11023.07, "total_tokens": 1134166989 }, { "epoch": 0.7181795448862216, "grad_norm": 0.8574967384338379, "learning_rate": 2e-05, "loss": 0.5539, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11488, "tokens_per_second_per_gpu": 9803.22, "total_tokens": 1134261613 }, { "epoch": 0.7182420605151287, "grad_norm": 0.9246487021446228, "learning_rate": 2e-05, "loss": 0.6328, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11489, "tokens_per_second_per_gpu": 9441.65, "total_tokens": 1134356323 }, { "epoch": 0.718304576144036, "grad_norm": 0.8911385536193848, "learning_rate": 2e-05, "loss": 0.6281, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11490, "tokens_per_second_per_gpu": 10803.19, "total_tokens": 1134456988 }, { "epoch": 0.7183670917729432, "grad_norm": 0.8868467807769775, "learning_rate": 2e-05, "loss": 0.615, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11491, "tokens_per_second_per_gpu": 11393.16, "total_tokens": 1134555686 }, { "epoch": 0.7184296074018505, "grad_norm": 0.9559859037399292, "learning_rate": 2e-05, "loss": 0.6594, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11492, "tokens_per_second_per_gpu": 10031.26, "total_tokens": 1134651102 }, { "epoch": 0.7184921230307577, "grad_norm": 0.9695863723754883, "learning_rate": 2e-05, "loss": 0.6547, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11493, "tokens_per_second_per_gpu": 10187.56, "total_tokens": 1134745852 }, { "epoch": 0.718554638659665, "grad_norm": 0.9137144088745117, "learning_rate": 2e-05, "loss": 0.6124, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11494, "tokens_per_second_per_gpu": 10814.64, "total_tokens": 1134843555 }, { "epoch": 0.7186171542885721, "grad_norm": 0.8980168104171753, "learning_rate": 2e-05, "loss": 0.6725, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11495, "tokens_per_second_per_gpu": 10232.04, "total_tokens": 1134943227 }, { "epoch": 0.7186796699174793, "grad_norm": 0.9022741317749023, "learning_rate": 2e-05, "loss": 0.6678, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11496, "tokens_per_second_per_gpu": 11176.79, "total_tokens": 1135049399 }, { "epoch": 0.7187421855463866, "grad_norm": 0.8929000496864319, "learning_rate": 2e-05, "loss": 0.5799, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11497, "tokens_per_second_per_gpu": 9552.44, "total_tokens": 1135148169 }, { "epoch": 0.7188047011752938, "grad_norm": 0.8742545247077942, "learning_rate": 2e-05, "loss": 0.6345, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11498, "tokens_per_second_per_gpu": 10322.9, "total_tokens": 1135248431 }, { "epoch": 0.7188672168042011, "grad_norm": 0.9096708297729492, "learning_rate": 2e-05, "loss": 0.6166, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11499, "tokens_per_second_per_gpu": 9699.94, "total_tokens": 1135344840 }, { "epoch": 0.7189297324331083, "grad_norm": 0.8743032813072205, "learning_rate": 2e-05, "loss": 0.595, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11500, "tokens_per_second_per_gpu": 10267.29, "total_tokens": 1135441454 }, { "epoch": 0.7189922480620154, "grad_norm": 0.8590759634971619, "learning_rate": 2e-05, "loss": 0.6121, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11501, "tokens_per_second_per_gpu": 10144.16, "total_tokens": 1135542305 }, { "epoch": 0.7190547636909227, "grad_norm": 0.8843773603439331, "learning_rate": 2e-05, "loss": 0.6161, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11502, "tokens_per_second_per_gpu": 11004.01, "total_tokens": 1135644024 }, { "epoch": 0.7191172793198299, "grad_norm": 0.8975203037261963, "learning_rate": 2e-05, "loss": 0.657, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11503, "tokens_per_second_per_gpu": 10856.07, "total_tokens": 1135746950 }, { "epoch": 0.7191797949487372, "grad_norm": 0.9291725158691406, "learning_rate": 2e-05, "loss": 0.6124, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11504, "tokens_per_second_per_gpu": 9805.9, "total_tokens": 1135839496 }, { "epoch": 0.7192423105776444, "grad_norm": 0.8578048348426819, "learning_rate": 2e-05, "loss": 0.6098, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11505, "tokens_per_second_per_gpu": 10202.79, "total_tokens": 1135938672 }, { "epoch": 0.7193048262065517, "grad_norm": 0.8757208585739136, "learning_rate": 2e-05, "loss": 0.5998, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11506, "tokens_per_second_per_gpu": 9895.08, "total_tokens": 1136031714 }, { "epoch": 0.7193673418354589, "grad_norm": 0.8588622808456421, "learning_rate": 2e-05, "loss": 0.5971, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11507, "tokens_per_second_per_gpu": 10750.2, "total_tokens": 1136130988 }, { "epoch": 0.719429857464366, "grad_norm": 0.8720876574516296, "learning_rate": 2e-05, "loss": 0.6254, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11508, "tokens_per_second_per_gpu": 10699.65, "total_tokens": 1136229119 }, { "epoch": 0.7194923730932733, "grad_norm": 0.904777467250824, "learning_rate": 2e-05, "loss": 0.6249, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11509, "tokens_per_second_per_gpu": 10323.26, "total_tokens": 1136326212 }, { "epoch": 0.7195548887221805, "grad_norm": 0.9172138571739197, "learning_rate": 2e-05, "loss": 0.5988, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11510, "tokens_per_second_per_gpu": 9542.5, "total_tokens": 1136421992 }, { "epoch": 0.7196174043510878, "grad_norm": 0.9187159538269043, "learning_rate": 2e-05, "loss": 0.6479, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11511, "tokens_per_second_per_gpu": 9759.63, "total_tokens": 1136519689 }, { "epoch": 0.719679919979995, "grad_norm": 0.8917601704597473, "learning_rate": 2e-05, "loss": 0.6389, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11512, "tokens_per_second_per_gpu": 10145.06, "total_tokens": 1136618165 }, { "epoch": 0.7197424356089023, "grad_norm": 0.9009239077568054, "learning_rate": 2e-05, "loss": 0.6588, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11513, "tokens_per_second_per_gpu": 11017.55, "total_tokens": 1136722613 }, { "epoch": 0.7198049512378094, "grad_norm": 0.9130361080169678, "learning_rate": 2e-05, "loss": 0.6201, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11514, "tokens_per_second_per_gpu": 10583.48, "total_tokens": 1136819227 }, { "epoch": 0.7198674668667167, "grad_norm": 0.9010123610496521, "learning_rate": 2e-05, "loss": 0.631, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11515, "tokens_per_second_per_gpu": 10592.55, "total_tokens": 1136915194 }, { "epoch": 0.7199299824956239, "grad_norm": 0.8778800964355469, "learning_rate": 2e-05, "loss": 0.6397, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11516, "tokens_per_second_per_gpu": 10641.86, "total_tokens": 1137016047 }, { "epoch": 0.7199924981245311, "grad_norm": 0.9024880528450012, "learning_rate": 2e-05, "loss": 0.6171, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11517, "tokens_per_second_per_gpu": 10834.0, "total_tokens": 1137115316 }, { "epoch": 0.7200550137534384, "grad_norm": 0.8683107495307922, "learning_rate": 2e-05, "loss": 0.6144, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11518, "tokens_per_second_per_gpu": 10530.67, "total_tokens": 1137216446 }, { "epoch": 0.7201175293823456, "grad_norm": 0.8800868988037109, "learning_rate": 2e-05, "loss": 0.6273, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11519, "tokens_per_second_per_gpu": 11191.99, "total_tokens": 1137318788 }, { "epoch": 0.7201800450112528, "grad_norm": 0.8846100568771362, "learning_rate": 2e-05, "loss": 0.6508, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11520, "tokens_per_second_per_gpu": 10391.14, "total_tokens": 1137418461 }, { "epoch": 0.72024256064016, "grad_norm": 0.8720064163208008, "learning_rate": 2e-05, "loss": 0.6254, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11521, "tokens_per_second_per_gpu": 9527.34, "total_tokens": 1137517835 }, { "epoch": 0.7203050762690673, "grad_norm": 0.8985047936439514, "learning_rate": 2e-05, "loss": 0.5991, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11522, "tokens_per_second_per_gpu": 10704.77, "total_tokens": 1137618852 }, { "epoch": 0.7203675918979745, "grad_norm": 0.8909242749214172, "learning_rate": 2e-05, "loss": 0.639, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11523, "tokens_per_second_per_gpu": 10274.45, "total_tokens": 1137717325 }, { "epoch": 0.7204301075268817, "grad_norm": 0.9135667681694031, "learning_rate": 2e-05, "loss": 0.6052, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11524, "tokens_per_second_per_gpu": 9214.11, "total_tokens": 1137810960 }, { "epoch": 0.720492623155789, "grad_norm": 0.8852685689926147, "learning_rate": 2e-05, "loss": 0.6511, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11525, "tokens_per_second_per_gpu": 11194.79, "total_tokens": 1137913996 }, { "epoch": 0.7205551387846961, "grad_norm": 0.9133703112602234, "learning_rate": 2e-05, "loss": 0.6721, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11526, "tokens_per_second_per_gpu": 10821.32, "total_tokens": 1138012129 }, { "epoch": 0.7206176544136034, "grad_norm": 0.8793713450431824, "learning_rate": 2e-05, "loss": 0.6213, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11527, "tokens_per_second_per_gpu": 10297.96, "total_tokens": 1138114085 }, { "epoch": 0.7206801700425106, "grad_norm": 0.8916499614715576, "learning_rate": 2e-05, "loss": 0.6235, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11528, "tokens_per_second_per_gpu": 10705.32, "total_tokens": 1138212576 }, { "epoch": 0.7207426856714179, "grad_norm": 0.8385809063911438, "learning_rate": 2e-05, "loss": 0.5764, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11529, "tokens_per_second_per_gpu": 10405.63, "total_tokens": 1138313972 }, { "epoch": 0.7208052013003251, "grad_norm": 0.845072329044342, "learning_rate": 2e-05, "loss": 0.5915, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11530, "tokens_per_second_per_gpu": 10434.94, "total_tokens": 1138412918 }, { "epoch": 0.7208677169292323, "grad_norm": 0.9026440978050232, "learning_rate": 2e-05, "loss": 0.6123, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11531, "tokens_per_second_per_gpu": 10623.99, "total_tokens": 1138511794 }, { "epoch": 0.7209302325581395, "grad_norm": 0.894886314868927, "learning_rate": 2e-05, "loss": 0.6427, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11532, "tokens_per_second_per_gpu": 10419.62, "total_tokens": 1138610125 }, { "epoch": 0.7209927481870467, "grad_norm": 0.9171721339225769, "learning_rate": 2e-05, "loss": 0.6587, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11533, "tokens_per_second_per_gpu": 10335.22, "total_tokens": 1138710406 }, { "epoch": 0.721055263815954, "grad_norm": 0.8824079036712646, "learning_rate": 2e-05, "loss": 0.625, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11534, "tokens_per_second_per_gpu": 9832.77, "total_tokens": 1138807581 }, { "epoch": 0.7211177794448612, "grad_norm": 0.9212815761566162, "learning_rate": 2e-05, "loss": 0.6234, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11535, "tokens_per_second_per_gpu": 10116.84, "total_tokens": 1138904524 }, { "epoch": 0.7211802950737685, "grad_norm": 0.9227977395057678, "learning_rate": 2e-05, "loss": 0.6304, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11536, "tokens_per_second_per_gpu": 10407.99, "total_tokens": 1139000342 }, { "epoch": 0.7212428107026757, "grad_norm": 0.8746113181114197, "learning_rate": 2e-05, "loss": 0.6488, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11537, "tokens_per_second_per_gpu": 10929.4, "total_tokens": 1139101736 }, { "epoch": 0.7213053263315828, "grad_norm": 0.924368143081665, "learning_rate": 2e-05, "loss": 0.5775, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11538, "tokens_per_second_per_gpu": 10381.37, "total_tokens": 1139201466 }, { "epoch": 0.7213678419604901, "grad_norm": 0.8788226842880249, "learning_rate": 2e-05, "loss": 0.6279, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11539, "tokens_per_second_per_gpu": 10403.07, "total_tokens": 1139304874 }, { "epoch": 0.7214303575893973, "grad_norm": 0.8694568276405334, "learning_rate": 2e-05, "loss": 0.5847, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11540, "tokens_per_second_per_gpu": 10475.31, "total_tokens": 1139402062 }, { "epoch": 0.7214928732183046, "grad_norm": 0.8789458274841309, "learning_rate": 2e-05, "loss": 0.5841, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11541, "tokens_per_second_per_gpu": 10813.25, "total_tokens": 1139501219 }, { "epoch": 0.7215553888472118, "grad_norm": 0.876189649105072, "learning_rate": 2e-05, "loss": 0.6507, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11542, "tokens_per_second_per_gpu": 10391.05, "total_tokens": 1139602030 }, { "epoch": 0.7216179044761191, "grad_norm": 0.8949794173240662, "learning_rate": 2e-05, "loss": 0.6135, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11543, "tokens_per_second_per_gpu": 10690.3, "total_tokens": 1139702628 }, { "epoch": 0.7216804201050263, "grad_norm": 0.8921545147895813, "learning_rate": 2e-05, "loss": 0.6125, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11544, "tokens_per_second_per_gpu": 10085.08, "total_tokens": 1139798403 }, { "epoch": 0.7217429357339334, "grad_norm": 0.8645642399787903, "learning_rate": 2e-05, "loss": 0.5693, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11545, "tokens_per_second_per_gpu": 8974.24, "total_tokens": 1139894536 }, { "epoch": 0.7218054513628407, "grad_norm": 0.8917523622512817, "learning_rate": 2e-05, "loss": 0.6282, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11546, "tokens_per_second_per_gpu": 9908.91, "total_tokens": 1139995236 }, { "epoch": 0.7218679669917479, "grad_norm": 0.860049843788147, "learning_rate": 2e-05, "loss": 0.6144, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11547, "tokens_per_second_per_gpu": 10966.32, "total_tokens": 1140098336 }, { "epoch": 0.7219304826206552, "grad_norm": 0.8802114129066467, "learning_rate": 2e-05, "loss": 0.644, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11548, "tokens_per_second_per_gpu": 11137.8, "total_tokens": 1140198423 }, { "epoch": 0.7219929982495624, "grad_norm": 0.8853590488433838, "learning_rate": 2e-05, "loss": 0.621, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11549, "tokens_per_second_per_gpu": 10163.41, "total_tokens": 1140295626 }, { "epoch": 0.7220555138784697, "grad_norm": 0.9110186100006104, "learning_rate": 2e-05, "loss": 0.6133, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11550, "tokens_per_second_per_gpu": 10111.67, "total_tokens": 1140392908 }, { "epoch": 0.7221180295073768, "grad_norm": 0.8818939328193665, "learning_rate": 2e-05, "loss": 0.5851, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11551, "tokens_per_second_per_gpu": 10514.96, "total_tokens": 1140488538 }, { "epoch": 0.722180545136284, "grad_norm": 0.9260530471801758, "learning_rate": 2e-05, "loss": 0.6184, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11552, "tokens_per_second_per_gpu": 11219.74, "total_tokens": 1140589237 }, { "epoch": 0.7222430607651913, "grad_norm": 0.9159391522407532, "learning_rate": 2e-05, "loss": 0.6446, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11553, "tokens_per_second_per_gpu": 10305.8, "total_tokens": 1140690559 }, { "epoch": 0.7223055763940985, "grad_norm": 0.8675487637519836, "learning_rate": 2e-05, "loss": 0.6391, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11554, "tokens_per_second_per_gpu": 10742.16, "total_tokens": 1140792350 }, { "epoch": 0.7223680920230058, "grad_norm": 0.9117034673690796, "learning_rate": 2e-05, "loss": 0.6316, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11555, "tokens_per_second_per_gpu": 10818.07, "total_tokens": 1140890470 }, { "epoch": 0.722430607651913, "grad_norm": 0.9038571715354919, "learning_rate": 2e-05, "loss": 0.6441, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11556, "tokens_per_second_per_gpu": 10124.41, "total_tokens": 1140987825 }, { "epoch": 0.7224931232808202, "grad_norm": 0.8932263851165771, "learning_rate": 2e-05, "loss": 0.6362, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11557, "tokens_per_second_per_gpu": 9736.62, "total_tokens": 1141084588 }, { "epoch": 0.7225556389097274, "grad_norm": 0.859255313873291, "learning_rate": 2e-05, "loss": 0.6234, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11558, "tokens_per_second_per_gpu": 10524.1, "total_tokens": 1141184731 }, { "epoch": 0.7226181545386346, "grad_norm": 0.860363781452179, "learning_rate": 2e-05, "loss": 0.6098, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11559, "tokens_per_second_per_gpu": 10827.48, "total_tokens": 1141283265 }, { "epoch": 0.7226806701675419, "grad_norm": 0.929949164390564, "learning_rate": 2e-05, "loss": 0.6083, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11560, "tokens_per_second_per_gpu": 10563.46, "total_tokens": 1141377488 }, { "epoch": 0.7227431857964491, "grad_norm": 0.9131263494491577, "learning_rate": 2e-05, "loss": 0.5776, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11561, "tokens_per_second_per_gpu": 10419.45, "total_tokens": 1141469504 }, { "epoch": 0.7228057014253564, "grad_norm": 0.8916954398155212, "learning_rate": 2e-05, "loss": 0.6116, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11562, "tokens_per_second_per_gpu": 10142.28, "total_tokens": 1141565493 }, { "epoch": 0.7228682170542635, "grad_norm": 0.8930199146270752, "learning_rate": 2e-05, "loss": 0.6168, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11563, "tokens_per_second_per_gpu": 10124.06, "total_tokens": 1141662154 }, { "epoch": 0.7229307326831708, "grad_norm": 0.9055477380752563, "learning_rate": 2e-05, "loss": 0.6589, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11564, "tokens_per_second_per_gpu": 10900.98, "total_tokens": 1141763299 }, { "epoch": 0.722993248312078, "grad_norm": 0.9030790328979492, "learning_rate": 2e-05, "loss": 0.623, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11565, "tokens_per_second_per_gpu": 10643.91, "total_tokens": 1141864037 }, { "epoch": 0.7230557639409853, "grad_norm": 0.9280545711517334, "learning_rate": 2e-05, "loss": 0.6198, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11566, "tokens_per_second_per_gpu": 10464.1, "total_tokens": 1141959461 }, { "epoch": 0.7231182795698925, "grad_norm": 0.9088826775550842, "learning_rate": 2e-05, "loss": 0.5881, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11567, "tokens_per_second_per_gpu": 10843.45, "total_tokens": 1142057735 }, { "epoch": 0.7231807951987997, "grad_norm": 0.8829676508903503, "learning_rate": 2e-05, "loss": 0.6291, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11568, "tokens_per_second_per_gpu": 11086.21, "total_tokens": 1142159984 }, { "epoch": 0.7232433108277069, "grad_norm": 0.9002882242202759, "learning_rate": 2e-05, "loss": 0.6168, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11569, "tokens_per_second_per_gpu": 10578.47, "total_tokens": 1142258935 }, { "epoch": 0.7233058264566141, "grad_norm": 0.8673859238624573, "learning_rate": 2e-05, "loss": 0.62, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11570, "tokens_per_second_per_gpu": 10019.84, "total_tokens": 1142356436 }, { "epoch": 0.7233683420855214, "grad_norm": 0.8890742659568787, "learning_rate": 2e-05, "loss": 0.625, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11571, "tokens_per_second_per_gpu": 10740.69, "total_tokens": 1142457229 }, { "epoch": 0.7234308577144286, "grad_norm": 0.919430673122406, "learning_rate": 2e-05, "loss": 0.6372, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11572, "tokens_per_second_per_gpu": 10730.2, "total_tokens": 1142554355 }, { "epoch": 0.7234933733433359, "grad_norm": 0.8925511837005615, "learning_rate": 2e-05, "loss": 0.6289, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11573, "tokens_per_second_per_gpu": 10703.21, "total_tokens": 1142653186 }, { "epoch": 0.7235558889722431, "grad_norm": 0.8835618495941162, "learning_rate": 2e-05, "loss": 0.6368, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11574, "tokens_per_second_per_gpu": 9981.8, "total_tokens": 1142751942 }, { "epoch": 0.7236184046011502, "grad_norm": 0.886284351348877, "learning_rate": 2e-05, "loss": 0.637, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11575, "tokens_per_second_per_gpu": 11233.61, "total_tokens": 1142857101 }, { "epoch": 0.7236809202300575, "grad_norm": 0.8576541543006897, "learning_rate": 2e-05, "loss": 0.5925, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11576, "tokens_per_second_per_gpu": 11381.49, "total_tokens": 1142958929 }, { "epoch": 0.7237434358589647, "grad_norm": 0.8840327858924866, "learning_rate": 2e-05, "loss": 0.6067, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11577, "tokens_per_second_per_gpu": 10678.52, "total_tokens": 1143053787 }, { "epoch": 0.723805951487872, "grad_norm": 0.8619066476821899, "learning_rate": 2e-05, "loss": 0.6217, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11578, "tokens_per_second_per_gpu": 9972.54, "total_tokens": 1143152508 }, { "epoch": 0.7238684671167792, "grad_norm": 0.863606333732605, "learning_rate": 2e-05, "loss": 0.5821, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11579, "tokens_per_second_per_gpu": 10360.4, "total_tokens": 1143251176 }, { "epoch": 0.7239309827456865, "grad_norm": 0.8500994443893433, "learning_rate": 2e-05, "loss": 0.5799, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11580, "tokens_per_second_per_gpu": 10688.43, "total_tokens": 1143350374 }, { "epoch": 0.7239934983745937, "grad_norm": 0.8733036518096924, "learning_rate": 2e-05, "loss": 0.6047, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11581, "tokens_per_second_per_gpu": 10265.63, "total_tokens": 1143449862 }, { "epoch": 0.7240560140035008, "grad_norm": 0.9093345403671265, "learning_rate": 2e-05, "loss": 0.6547, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11582, "tokens_per_second_per_gpu": 10933.44, "total_tokens": 1143550870 }, { "epoch": 0.7241185296324081, "grad_norm": 0.912787675857544, "learning_rate": 2e-05, "loss": 0.598, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11583, "tokens_per_second_per_gpu": 10302.08, "total_tokens": 1143645748 }, { "epoch": 0.7241810452613153, "grad_norm": 0.8816200494766235, "learning_rate": 2e-05, "loss": 0.5968, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11584, "tokens_per_second_per_gpu": 10651.76, "total_tokens": 1143744868 }, { "epoch": 0.7242435608902226, "grad_norm": 0.9337819218635559, "learning_rate": 2e-05, "loss": 0.6297, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11585, "tokens_per_second_per_gpu": 9316.25, "total_tokens": 1143838989 }, { "epoch": 0.7243060765191298, "grad_norm": 0.9148811101913452, "learning_rate": 2e-05, "loss": 0.5973, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11586, "tokens_per_second_per_gpu": 10008.62, "total_tokens": 1143934168 }, { "epoch": 0.7243685921480371, "grad_norm": 0.9414291977882385, "learning_rate": 2e-05, "loss": 0.6715, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11587, "tokens_per_second_per_gpu": 10665.96, "total_tokens": 1144033879 }, { "epoch": 0.7244311077769442, "grad_norm": 0.8709332942962646, "learning_rate": 2e-05, "loss": 0.6175, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11588, "tokens_per_second_per_gpu": 9785.46, "total_tokens": 1144131110 }, { "epoch": 0.7244936234058514, "grad_norm": 0.882220983505249, "learning_rate": 2e-05, "loss": 0.6262, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11589, "tokens_per_second_per_gpu": 11340.44, "total_tokens": 1144233965 }, { "epoch": 0.7245561390347587, "grad_norm": 0.9283773899078369, "learning_rate": 2e-05, "loss": 0.6145, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11590, "tokens_per_second_per_gpu": 10480.76, "total_tokens": 1144333686 }, { "epoch": 0.7246186546636659, "grad_norm": 0.8769643902778625, "learning_rate": 2e-05, "loss": 0.5967, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11591, "tokens_per_second_per_gpu": 10224.27, "total_tokens": 1144429084 }, { "epoch": 0.7246811702925732, "grad_norm": 0.9028669595718384, "learning_rate": 2e-05, "loss": 0.6338, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11592, "tokens_per_second_per_gpu": 10908.95, "total_tokens": 1144531066 }, { "epoch": 0.7247436859214804, "grad_norm": 0.8968443870544434, "learning_rate": 2e-05, "loss": 0.6096, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11593, "tokens_per_second_per_gpu": 10568.54, "total_tokens": 1144628147 }, { "epoch": 0.7248062015503876, "grad_norm": 0.8810968399047852, "learning_rate": 2e-05, "loss": 0.5763, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11594, "tokens_per_second_per_gpu": 9928.59, "total_tokens": 1144723733 }, { "epoch": 0.7248687171792948, "grad_norm": 0.8692195415496826, "learning_rate": 2e-05, "loss": 0.621, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11595, "tokens_per_second_per_gpu": 10671.25, "total_tokens": 1144823770 }, { "epoch": 0.724931232808202, "grad_norm": 0.8822752833366394, "learning_rate": 2e-05, "loss": 0.614, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11596, "tokens_per_second_per_gpu": 10632.56, "total_tokens": 1144921130 }, { "epoch": 0.7249937484371093, "grad_norm": 0.8875386714935303, "learning_rate": 2e-05, "loss": 0.5882, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11597, "tokens_per_second_per_gpu": 9819.84, "total_tokens": 1145016070 }, { "epoch": 0.7250562640660165, "grad_norm": 0.8504307866096497, "learning_rate": 2e-05, "loss": 0.5937, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11598, "tokens_per_second_per_gpu": 11055.83, "total_tokens": 1145116780 }, { "epoch": 0.7251187796949238, "grad_norm": 0.8860695958137512, "learning_rate": 2e-05, "loss": 0.6046, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11599, "tokens_per_second_per_gpu": 9501.32, "total_tokens": 1145212304 }, { "epoch": 0.7251812953238309, "grad_norm": 0.9053046703338623, "learning_rate": 2e-05, "loss": 0.5995, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11600, "tokens_per_second_per_gpu": 10372.08, "total_tokens": 1145310812 }, { "epoch": 0.7252438109527382, "grad_norm": 0.9159161448478699, "learning_rate": 2e-05, "loss": 0.6393, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11601, "tokens_per_second_per_gpu": 10438.49, "total_tokens": 1145406138 }, { "epoch": 0.7253063265816454, "grad_norm": 0.8632400035858154, "learning_rate": 2e-05, "loss": 0.5699, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11602, "tokens_per_second_per_gpu": 9732.22, "total_tokens": 1145502743 }, { "epoch": 0.7253688422105526, "grad_norm": 0.8795470595359802, "learning_rate": 2e-05, "loss": 0.5986, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11603, "tokens_per_second_per_gpu": 10799.79, "total_tokens": 1145602070 }, { "epoch": 0.7254313578394599, "grad_norm": 0.9194148182868958, "learning_rate": 2e-05, "loss": 0.6358, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11604, "tokens_per_second_per_gpu": 9795.54, "total_tokens": 1145698373 }, { "epoch": 0.7254938734683671, "grad_norm": 0.9031579494476318, "learning_rate": 2e-05, "loss": 0.6057, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11605, "tokens_per_second_per_gpu": 10934.67, "total_tokens": 1145797938 }, { "epoch": 0.7255563890972743, "grad_norm": 0.9153050780296326, "learning_rate": 2e-05, "loss": 0.6408, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11606, "tokens_per_second_per_gpu": 10138.43, "total_tokens": 1145895794 }, { "epoch": 0.7256189047261815, "grad_norm": 0.9188529253005981, "learning_rate": 2e-05, "loss": 0.6581, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11607, "tokens_per_second_per_gpu": 10365.5, "total_tokens": 1145993161 }, { "epoch": 0.7256814203550888, "grad_norm": 0.8832847476005554, "learning_rate": 2e-05, "loss": 0.6176, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11608, "tokens_per_second_per_gpu": 10477.03, "total_tokens": 1146089001 }, { "epoch": 0.725743935983996, "grad_norm": 0.8993613719940186, "learning_rate": 2e-05, "loss": 0.6485, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11609, "tokens_per_second_per_gpu": 11059.42, "total_tokens": 1146191015 }, { "epoch": 0.7258064516129032, "grad_norm": 0.8623324036598206, "learning_rate": 2e-05, "loss": 0.6094, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11610, "tokens_per_second_per_gpu": 10163.23, "total_tokens": 1146286836 }, { "epoch": 0.7258689672418105, "grad_norm": 0.8553829789161682, "learning_rate": 2e-05, "loss": 0.6162, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11611, "tokens_per_second_per_gpu": 10611.24, "total_tokens": 1146388097 }, { "epoch": 0.7259314828707176, "grad_norm": 0.9017962217330933, "learning_rate": 2e-05, "loss": 0.6148, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11612, "tokens_per_second_per_gpu": 10421.24, "total_tokens": 1146485314 }, { "epoch": 0.7259939984996249, "grad_norm": 0.9164466261863708, "learning_rate": 2e-05, "loss": 0.6387, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11613, "tokens_per_second_per_gpu": 9728.66, "total_tokens": 1146580095 }, { "epoch": 0.7260565141285321, "grad_norm": 0.923748791217804, "learning_rate": 2e-05, "loss": 0.6451, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11614, "tokens_per_second_per_gpu": 10309.85, "total_tokens": 1146676791 }, { "epoch": 0.7261190297574394, "grad_norm": 0.8564788699150085, "learning_rate": 2e-05, "loss": 0.5895, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11615, "tokens_per_second_per_gpu": 10687.0, "total_tokens": 1146775461 }, { "epoch": 0.7261815453863466, "grad_norm": 0.8971266150474548, "learning_rate": 2e-05, "loss": 0.6257, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11616, "tokens_per_second_per_gpu": 10712.38, "total_tokens": 1146876723 }, { "epoch": 0.7262440610152539, "grad_norm": 0.9215987324714661, "learning_rate": 2e-05, "loss": 0.6307, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11617, "tokens_per_second_per_gpu": 10643.58, "total_tokens": 1146976397 }, { "epoch": 0.7263065766441611, "grad_norm": 0.8580374121665955, "learning_rate": 2e-05, "loss": 0.6015, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11618, "tokens_per_second_per_gpu": 11078.26, "total_tokens": 1147077918 }, { "epoch": 0.7263690922730682, "grad_norm": 0.8751099109649658, "learning_rate": 2e-05, "loss": 0.5827, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11619, "tokens_per_second_per_gpu": 10642.05, "total_tokens": 1147175391 }, { "epoch": 0.7264316079019755, "grad_norm": 0.8534489274024963, "learning_rate": 2e-05, "loss": 0.654, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11620, "tokens_per_second_per_gpu": 10554.73, "total_tokens": 1147277900 }, { "epoch": 0.7264941235308827, "grad_norm": 0.8822038769721985, "learning_rate": 2e-05, "loss": 0.6329, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11621, "tokens_per_second_per_gpu": 9887.76, "total_tokens": 1147373919 }, { "epoch": 0.72655663915979, "grad_norm": 0.9456940293312073, "learning_rate": 2e-05, "loss": 0.6116, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11622, "tokens_per_second_per_gpu": 10607.24, "total_tokens": 1147466011 }, { "epoch": 0.7266191547886972, "grad_norm": 0.8791143298149109, "learning_rate": 2e-05, "loss": 0.5881, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11623, "tokens_per_second_per_gpu": 9653.28, "total_tokens": 1147563151 }, { "epoch": 0.7266816704176045, "grad_norm": 0.8838028311729431, "learning_rate": 2e-05, "loss": 0.5954, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11624, "tokens_per_second_per_gpu": 10344.57, "total_tokens": 1147662095 }, { "epoch": 0.7267441860465116, "grad_norm": 0.8852810263633728, "learning_rate": 2e-05, "loss": 0.6132, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11625, "tokens_per_second_per_gpu": 10080.65, "total_tokens": 1147762037 }, { "epoch": 0.7268067016754188, "grad_norm": 0.9075102210044861, "learning_rate": 2e-05, "loss": 0.594, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11626, "tokens_per_second_per_gpu": 9979.78, "total_tokens": 1147852764 }, { "epoch": 0.7268692173043261, "grad_norm": 0.9209592342376709, "learning_rate": 2e-05, "loss": 0.6105, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11627, "tokens_per_second_per_gpu": 10718.45, "total_tokens": 1147951210 }, { "epoch": 0.7269317329332333, "grad_norm": 0.8699488043785095, "learning_rate": 2e-05, "loss": 0.6611, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11628, "tokens_per_second_per_gpu": 10799.91, "total_tokens": 1148052109 }, { "epoch": 0.7269942485621406, "grad_norm": 0.9255300164222717, "learning_rate": 2e-05, "loss": 0.6284, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11629, "tokens_per_second_per_gpu": 10561.26, "total_tokens": 1148147949 }, { "epoch": 0.7270567641910478, "grad_norm": 0.8968694806098938, "learning_rate": 2e-05, "loss": 0.6028, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11630, "tokens_per_second_per_gpu": 10683.98, "total_tokens": 1148246389 }, { "epoch": 0.727119279819955, "grad_norm": 0.9246377348899841, "learning_rate": 2e-05, "loss": 0.5981, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11631, "tokens_per_second_per_gpu": 10249.44, "total_tokens": 1148341348 }, { "epoch": 0.7271817954488622, "grad_norm": 0.9086728096008301, "learning_rate": 2e-05, "loss": 0.6373, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11632, "tokens_per_second_per_gpu": 10641.97, "total_tokens": 1148441917 }, { "epoch": 0.7272443110777694, "grad_norm": 0.874851644039154, "learning_rate": 2e-05, "loss": 0.663, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11633, "tokens_per_second_per_gpu": 9950.48, "total_tokens": 1148542961 }, { "epoch": 0.7273068267066767, "grad_norm": 0.860146164894104, "learning_rate": 2e-05, "loss": 0.5305, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11634, "tokens_per_second_per_gpu": 9713.98, "total_tokens": 1148634853 }, { "epoch": 0.7273693423355839, "grad_norm": 0.8764286637306213, "learning_rate": 2e-05, "loss": 0.5882, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11635, "tokens_per_second_per_gpu": 9889.55, "total_tokens": 1148730412 }, { "epoch": 0.7274318579644912, "grad_norm": 0.8792805671691895, "learning_rate": 2e-05, "loss": 0.6074, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11636, "tokens_per_second_per_gpu": 11038.29, "total_tokens": 1148832685 }, { "epoch": 0.7274943735933983, "grad_norm": 0.884699285030365, "learning_rate": 2e-05, "loss": 0.609, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11637, "tokens_per_second_per_gpu": 10439.65, "total_tokens": 1148929470 }, { "epoch": 0.7275568892223055, "grad_norm": 0.8598029613494873, "learning_rate": 2e-05, "loss": 0.6008, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11638, "tokens_per_second_per_gpu": 10612.05, "total_tokens": 1149030818 }, { "epoch": 0.7276194048512128, "grad_norm": 0.8904111385345459, "learning_rate": 2e-05, "loss": 0.6262, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11639, "tokens_per_second_per_gpu": 11208.33, "total_tokens": 1149129172 }, { "epoch": 0.72768192048012, "grad_norm": 0.8602427244186401, "learning_rate": 2e-05, "loss": 0.5898, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11640, "tokens_per_second_per_gpu": 10523.04, "total_tokens": 1149225876 }, { "epoch": 0.7277444361090273, "grad_norm": 0.9451654553413391, "learning_rate": 2e-05, "loss": 0.5944, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11641, "tokens_per_second_per_gpu": 10280.33, "total_tokens": 1149322832 }, { "epoch": 0.7278069517379345, "grad_norm": 0.8947023153305054, "learning_rate": 2e-05, "loss": 0.6414, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11642, "tokens_per_second_per_gpu": 11298.25, "total_tokens": 1149425051 }, { "epoch": 0.7278694673668417, "grad_norm": 0.9198508262634277, "learning_rate": 2e-05, "loss": 0.5824, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11643, "tokens_per_second_per_gpu": 9839.67, "total_tokens": 1149520009 }, { "epoch": 0.7279319829957489, "grad_norm": 0.9361582398414612, "learning_rate": 2e-05, "loss": 0.588, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11644, "tokens_per_second_per_gpu": 10226.41, "total_tokens": 1149618602 }, { "epoch": 0.7279944986246562, "grad_norm": 0.8767353892326355, "learning_rate": 2e-05, "loss": 0.5878, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11645, "tokens_per_second_per_gpu": 9859.48, "total_tokens": 1149709047 }, { "epoch": 0.7280570142535634, "grad_norm": 0.8629885315895081, "learning_rate": 2e-05, "loss": 0.6191, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11646, "tokens_per_second_per_gpu": 10670.89, "total_tokens": 1149811260 }, { "epoch": 0.7281195298824706, "grad_norm": 0.8601353168487549, "learning_rate": 2e-05, "loss": 0.6251, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11647, "tokens_per_second_per_gpu": 10130.12, "total_tokens": 1149910526 }, { "epoch": 0.7281820455113779, "grad_norm": 0.8868663311004639, "learning_rate": 2e-05, "loss": 0.6016, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11648, "tokens_per_second_per_gpu": 10316.2, "total_tokens": 1150007359 }, { "epoch": 0.728244561140285, "grad_norm": 0.8475369215011597, "learning_rate": 2e-05, "loss": 0.5992, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11649, "tokens_per_second_per_gpu": 10121.78, "total_tokens": 1150107455 }, { "epoch": 0.7283070767691923, "grad_norm": 0.8962197303771973, "learning_rate": 2e-05, "loss": 0.6108, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11650, "tokens_per_second_per_gpu": 11528.72, "total_tokens": 1150211834 }, { "epoch": 0.7283695923980995, "grad_norm": 0.9016979932785034, "learning_rate": 2e-05, "loss": 0.577, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11651, "tokens_per_second_per_gpu": 11452.56, "total_tokens": 1150308556 }, { "epoch": 0.7284321080270068, "grad_norm": 0.8889947533607483, "learning_rate": 2e-05, "loss": 0.5983, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11652, "tokens_per_second_per_gpu": 14085.38, "total_tokens": 1150407472 }, { "epoch": 0.728494623655914, "grad_norm": 0.8741746544837952, "learning_rate": 2e-05, "loss": 0.6057, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11653, "tokens_per_second_per_gpu": 12316.8, "total_tokens": 1150505128 }, { "epoch": 0.7285571392848212, "grad_norm": 0.8457549810409546, "learning_rate": 2e-05, "loss": 0.5933, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11654, "tokens_per_second_per_gpu": 10822.89, "total_tokens": 1150605363 }, { "epoch": 0.7286196549137285, "grad_norm": 0.91518235206604, "learning_rate": 2e-05, "loss": 0.6117, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11655, "tokens_per_second_per_gpu": 10835.95, "total_tokens": 1150702183 }, { "epoch": 0.7286821705426356, "grad_norm": 0.9335306286811829, "learning_rate": 2e-05, "loss": 0.6316, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11656, "tokens_per_second_per_gpu": 11091.18, "total_tokens": 1150801778 }, { "epoch": 0.7287446861715429, "grad_norm": 0.9321853518486023, "learning_rate": 2e-05, "loss": 0.6147, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11657, "tokens_per_second_per_gpu": 10153.99, "total_tokens": 1150899328 }, { "epoch": 0.7288072018004501, "grad_norm": 0.9068934321403503, "learning_rate": 2e-05, "loss": 0.6526, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11658, "tokens_per_second_per_gpu": 9931.05, "total_tokens": 1150997879 }, { "epoch": 0.7288697174293574, "grad_norm": 0.9036986231803894, "learning_rate": 2e-05, "loss": 0.6671, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11659, "tokens_per_second_per_gpu": 11386.45, "total_tokens": 1151098478 }, { "epoch": 0.7289322330582646, "grad_norm": 0.8915536403656006, "learning_rate": 2e-05, "loss": 0.616, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11660, "tokens_per_second_per_gpu": 9684.04, "total_tokens": 1151195821 }, { "epoch": 0.7289947486871718, "grad_norm": 0.8987020254135132, "learning_rate": 2e-05, "loss": 0.646, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11661, "tokens_per_second_per_gpu": 10709.05, "total_tokens": 1151298397 }, { "epoch": 0.729057264316079, "grad_norm": 0.8859087824821472, "learning_rate": 2e-05, "loss": 0.6008, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11662, "tokens_per_second_per_gpu": 10380.91, "total_tokens": 1151396977 }, { "epoch": 0.7291197799449862, "grad_norm": 0.9162821173667908, "learning_rate": 2e-05, "loss": 0.6165, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11663, "tokens_per_second_per_gpu": 9819.92, "total_tokens": 1151490984 }, { "epoch": 0.7291822955738935, "grad_norm": 0.9442530870437622, "learning_rate": 2e-05, "loss": 0.6328, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11664, "tokens_per_second_per_gpu": 10631.15, "total_tokens": 1151590419 }, { "epoch": 0.7292448112028007, "grad_norm": 0.8959223031997681, "learning_rate": 2e-05, "loss": 0.6002, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11665, "tokens_per_second_per_gpu": 10993.58, "total_tokens": 1151689861 }, { "epoch": 0.729307326831708, "grad_norm": 0.8766617774963379, "learning_rate": 2e-05, "loss": 0.6017, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11666, "tokens_per_second_per_gpu": 10650.82, "total_tokens": 1151789301 }, { "epoch": 0.7293698424606152, "grad_norm": 0.9015414714813232, "learning_rate": 2e-05, "loss": 0.6297, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11667, "tokens_per_second_per_gpu": 10376.41, "total_tokens": 1151888281 }, { "epoch": 0.7294323580895223, "grad_norm": 0.9344537258148193, "learning_rate": 2e-05, "loss": 0.6121, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11668, "tokens_per_second_per_gpu": 10046.17, "total_tokens": 1151989976 }, { "epoch": 0.7294948737184296, "grad_norm": 0.9046867489814758, "learning_rate": 2e-05, "loss": 0.6091, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11669, "tokens_per_second_per_gpu": 10238.17, "total_tokens": 1152087296 }, { "epoch": 0.7295573893473368, "grad_norm": 0.8437564373016357, "learning_rate": 2e-05, "loss": 0.6164, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11670, "tokens_per_second_per_gpu": 10977.04, "total_tokens": 1152188753 }, { "epoch": 0.7296199049762441, "grad_norm": 0.8868705630302429, "learning_rate": 2e-05, "loss": 0.6547, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11671, "tokens_per_second_per_gpu": 11153.78, "total_tokens": 1152289813 }, { "epoch": 0.7296824206051513, "grad_norm": 0.9101773500442505, "learning_rate": 2e-05, "loss": 0.6275, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11672, "tokens_per_second_per_gpu": 10339.57, "total_tokens": 1152386891 }, { "epoch": 0.7297449362340586, "grad_norm": 0.8669076561927795, "learning_rate": 2e-05, "loss": 0.5857, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11673, "tokens_per_second_per_gpu": 10690.57, "total_tokens": 1152487810 }, { "epoch": 0.7298074518629657, "grad_norm": 0.8986690044403076, "learning_rate": 2e-05, "loss": 0.6098, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11674, "tokens_per_second_per_gpu": 10901.77, "total_tokens": 1152585110 }, { "epoch": 0.7298699674918729, "grad_norm": 0.9219426512718201, "learning_rate": 2e-05, "loss": 0.6274, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11675, "tokens_per_second_per_gpu": 9787.67, "total_tokens": 1152683343 }, { "epoch": 0.7299324831207802, "grad_norm": 0.8769482374191284, "learning_rate": 2e-05, "loss": 0.6344, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11676, "tokens_per_second_per_gpu": 10366.29, "total_tokens": 1152783418 }, { "epoch": 0.7299949987496874, "grad_norm": 0.9210414290428162, "learning_rate": 2e-05, "loss": 0.6219, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11677, "tokens_per_second_per_gpu": 9821.43, "total_tokens": 1152877248 }, { "epoch": 0.7300575143785947, "grad_norm": 0.859245240688324, "learning_rate": 2e-05, "loss": 0.6384, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11678, "tokens_per_second_per_gpu": 10675.5, "total_tokens": 1152979423 }, { "epoch": 0.7301200300075019, "grad_norm": 0.8765584826469421, "learning_rate": 2e-05, "loss": 0.638, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11679, "tokens_per_second_per_gpu": 10531.98, "total_tokens": 1153078481 }, { "epoch": 0.730182545636409, "grad_norm": 0.8880025744438171, "learning_rate": 2e-05, "loss": 0.6383, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11680, "tokens_per_second_per_gpu": 11155.26, "total_tokens": 1153179563 }, { "epoch": 0.7302450612653163, "grad_norm": 0.8718317747116089, "learning_rate": 2e-05, "loss": 0.6159, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11681, "tokens_per_second_per_gpu": 11384.05, "total_tokens": 1153280926 }, { "epoch": 0.7303075768942235, "grad_norm": 0.8925797939300537, "learning_rate": 2e-05, "loss": 0.6296, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11682, "tokens_per_second_per_gpu": 10223.86, "total_tokens": 1153376705 }, { "epoch": 0.7303700925231308, "grad_norm": 0.8949022889137268, "learning_rate": 2e-05, "loss": 0.5898, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11683, "tokens_per_second_per_gpu": 10581.98, "total_tokens": 1153473578 }, { "epoch": 0.730432608152038, "grad_norm": 0.8492420315742493, "learning_rate": 2e-05, "loss": 0.5772, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11684, "tokens_per_second_per_gpu": 9623.86, "total_tokens": 1153568045 }, { "epoch": 0.7304951237809453, "grad_norm": 0.8818102478981018, "learning_rate": 2e-05, "loss": 0.6177, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11685, "tokens_per_second_per_gpu": 10575.22, "total_tokens": 1153665893 }, { "epoch": 0.7305576394098524, "grad_norm": 0.9108352661132812, "learning_rate": 2e-05, "loss": 0.6395, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11686, "tokens_per_second_per_gpu": 10538.14, "total_tokens": 1153763869 }, { "epoch": 0.7306201550387597, "grad_norm": 0.9011589288711548, "learning_rate": 2e-05, "loss": 0.6102, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11687, "tokens_per_second_per_gpu": 10442.87, "total_tokens": 1153863972 }, { "epoch": 0.7306826706676669, "grad_norm": 0.9146543145179749, "learning_rate": 2e-05, "loss": 0.5914, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11688, "tokens_per_second_per_gpu": 10255.23, "total_tokens": 1153962858 }, { "epoch": 0.7307451862965741, "grad_norm": 0.9070571064949036, "learning_rate": 2e-05, "loss": 0.6362, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11689, "tokens_per_second_per_gpu": 11048.31, "total_tokens": 1154062388 }, { "epoch": 0.7308077019254814, "grad_norm": 0.8595972061157227, "learning_rate": 2e-05, "loss": 0.6081, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11690, "tokens_per_second_per_gpu": 10547.42, "total_tokens": 1154163204 }, { "epoch": 0.7308702175543886, "grad_norm": 0.9018584489822388, "learning_rate": 2e-05, "loss": 0.6127, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11691, "tokens_per_second_per_gpu": 11075.9, "total_tokens": 1154265541 }, { "epoch": 0.7309327331832958, "grad_norm": 0.8888997435569763, "learning_rate": 2e-05, "loss": 0.6027, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11692, "tokens_per_second_per_gpu": 10059.8, "total_tokens": 1154364850 }, { "epoch": 0.730995248812203, "grad_norm": 0.870847225189209, "learning_rate": 2e-05, "loss": 0.6001, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11693, "tokens_per_second_per_gpu": 10307.17, "total_tokens": 1154463228 }, { "epoch": 0.7310577644411103, "grad_norm": 0.9046092629432678, "learning_rate": 2e-05, "loss": 0.6442, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11694, "tokens_per_second_per_gpu": 10579.61, "total_tokens": 1154563667 }, { "epoch": 0.7311202800700175, "grad_norm": 0.8563384413719177, "learning_rate": 2e-05, "loss": 0.6189, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11695, "tokens_per_second_per_gpu": 11498.53, "total_tokens": 1154665580 }, { "epoch": 0.7311827956989247, "grad_norm": 0.8785672187805176, "learning_rate": 2e-05, "loss": 0.6337, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11696, "tokens_per_second_per_gpu": 10451.06, "total_tokens": 1154765383 }, { "epoch": 0.731245311327832, "grad_norm": 0.8745167255401611, "learning_rate": 2e-05, "loss": 0.6123, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11697, "tokens_per_second_per_gpu": 10412.77, "total_tokens": 1154863896 }, { "epoch": 0.7313078269567392, "grad_norm": 0.9088321328163147, "learning_rate": 2e-05, "loss": 0.6671, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11698, "tokens_per_second_per_gpu": 10717.33, "total_tokens": 1154964183 }, { "epoch": 0.7313703425856464, "grad_norm": 0.8602350950241089, "learning_rate": 2e-05, "loss": 0.6311, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11699, "tokens_per_second_per_gpu": 11503.9, "total_tokens": 1155065515 }, { "epoch": 0.7314328582145536, "grad_norm": 0.8884825706481934, "learning_rate": 2e-05, "loss": 0.6121, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11700, "tokens_per_second_per_gpu": 10406.43, "total_tokens": 1155161014 }, { "epoch": 0.7314953738434609, "grad_norm": 0.8759029507637024, "learning_rate": 2e-05, "loss": 0.6011, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11701, "tokens_per_second_per_gpu": 10561.27, "total_tokens": 1155259507 }, { "epoch": 0.7315578894723681, "grad_norm": 0.9044011831283569, "learning_rate": 2e-05, "loss": 0.5825, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11702, "tokens_per_second_per_gpu": 10203.05, "total_tokens": 1155352057 }, { "epoch": 0.7316204051012754, "grad_norm": 0.8520450592041016, "learning_rate": 2e-05, "loss": 0.5873, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11703, "tokens_per_second_per_gpu": 10668.24, "total_tokens": 1155450317 }, { "epoch": 0.7316829207301826, "grad_norm": 0.9055732488632202, "learning_rate": 2e-05, "loss": 0.5863, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11704, "tokens_per_second_per_gpu": 9661.67, "total_tokens": 1155547196 }, { "epoch": 0.7317454363590897, "grad_norm": 0.8882726430892944, "learning_rate": 2e-05, "loss": 0.6088, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11705, "tokens_per_second_per_gpu": 10628.68, "total_tokens": 1155647412 }, { "epoch": 0.731807951987997, "grad_norm": 0.9653422236442566, "learning_rate": 2e-05, "loss": 0.6525, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11706, "tokens_per_second_per_gpu": 10728.21, "total_tokens": 1155743188 }, { "epoch": 0.7318704676169042, "grad_norm": 0.8721374273300171, "learning_rate": 2e-05, "loss": 0.6478, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11707, "tokens_per_second_per_gpu": 10840.28, "total_tokens": 1155846892 }, { "epoch": 0.7319329832458115, "grad_norm": 0.8891928791999817, "learning_rate": 2e-05, "loss": 0.6377, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11708, "tokens_per_second_per_gpu": 10932.11, "total_tokens": 1155950355 }, { "epoch": 0.7319954988747187, "grad_norm": 0.8929769396781921, "learning_rate": 2e-05, "loss": 0.604, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11709, "tokens_per_second_per_gpu": 10101.59, "total_tokens": 1156046883 }, { "epoch": 0.732058014503626, "grad_norm": 0.8724578619003296, "learning_rate": 2e-05, "loss": 0.6257, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11710, "tokens_per_second_per_gpu": 10872.85, "total_tokens": 1156148290 }, { "epoch": 0.7321205301325331, "grad_norm": 0.8630757331848145, "learning_rate": 2e-05, "loss": 0.6061, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11711, "tokens_per_second_per_gpu": 9674.98, "total_tokens": 1156246338 }, { "epoch": 0.7321830457614403, "grad_norm": 0.8489198088645935, "learning_rate": 2e-05, "loss": 0.6386, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11712, "tokens_per_second_per_gpu": 11461.94, "total_tokens": 1156352496 }, { "epoch": 0.7322455613903476, "grad_norm": 0.84952712059021, "learning_rate": 2e-05, "loss": 0.6157, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11713, "tokens_per_second_per_gpu": 10802.37, "total_tokens": 1156456144 }, { "epoch": 0.7323080770192548, "grad_norm": 0.8591519594192505, "learning_rate": 2e-05, "loss": 0.6457, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11714, "tokens_per_second_per_gpu": 11354.21, "total_tokens": 1156559642 }, { "epoch": 0.7323705926481621, "grad_norm": 0.8812222480773926, "learning_rate": 2e-05, "loss": 0.5989, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11715, "tokens_per_second_per_gpu": 10303.23, "total_tokens": 1156658522 }, { "epoch": 0.7324331082770693, "grad_norm": 0.8896636366844177, "learning_rate": 2e-05, "loss": 0.6133, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11716, "tokens_per_second_per_gpu": 11226.59, "total_tokens": 1156759090 }, { "epoch": 0.7324956239059764, "grad_norm": 0.8949344158172607, "learning_rate": 2e-05, "loss": 0.6313, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11717, "tokens_per_second_per_gpu": 10150.58, "total_tokens": 1156859479 }, { "epoch": 0.7325581395348837, "grad_norm": 0.9037944674491882, "learning_rate": 2e-05, "loss": 0.647, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11718, "tokens_per_second_per_gpu": 10184.07, "total_tokens": 1156959086 }, { "epoch": 0.7326206551637909, "grad_norm": 0.91167151927948, "learning_rate": 2e-05, "loss": 0.6215, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11719, "tokens_per_second_per_gpu": 10043.76, "total_tokens": 1157057065 }, { "epoch": 0.7326831707926982, "grad_norm": 0.8783972859382629, "learning_rate": 2e-05, "loss": 0.6067, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11720, "tokens_per_second_per_gpu": 10198.35, "total_tokens": 1157156377 }, { "epoch": 0.7327456864216054, "grad_norm": 0.8678976893424988, "learning_rate": 2e-05, "loss": 0.5987, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11721, "tokens_per_second_per_gpu": 10341.76, "total_tokens": 1157253464 }, { "epoch": 0.7328082020505127, "grad_norm": 0.9067542552947998, "learning_rate": 2e-05, "loss": 0.6761, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11722, "tokens_per_second_per_gpu": 10928.19, "total_tokens": 1157356831 }, { "epoch": 0.7328707176794198, "grad_norm": 0.8967226147651672, "learning_rate": 2e-05, "loss": 0.626, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11723, "tokens_per_second_per_gpu": 11314.7, "total_tokens": 1157459585 }, { "epoch": 0.732933233308327, "grad_norm": 0.9167718887329102, "learning_rate": 2e-05, "loss": 0.6173, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11724, "tokens_per_second_per_gpu": 10638.1, "total_tokens": 1157554579 }, { "epoch": 0.7329957489372343, "grad_norm": 0.8893719911575317, "learning_rate": 2e-05, "loss": 0.6489, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11725, "tokens_per_second_per_gpu": 11168.39, "total_tokens": 1157654793 }, { "epoch": 0.7330582645661415, "grad_norm": 0.8695188760757446, "learning_rate": 2e-05, "loss": 0.613, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11726, "tokens_per_second_per_gpu": 10491.91, "total_tokens": 1157754464 }, { "epoch": 0.7331207801950488, "grad_norm": 0.888869047164917, "learning_rate": 2e-05, "loss": 0.6177, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11727, "tokens_per_second_per_gpu": 10045.1, "total_tokens": 1157850618 }, { "epoch": 0.733183295823956, "grad_norm": 0.9840679168701172, "learning_rate": 2e-05, "loss": 0.6251, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11728, "tokens_per_second_per_gpu": 9263.75, "total_tokens": 1157947089 }, { "epoch": 0.7332458114528632, "grad_norm": 0.8970162272453308, "learning_rate": 2e-05, "loss": 0.6396, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11729, "tokens_per_second_per_gpu": 11466.38, "total_tokens": 1158052163 }, { "epoch": 0.7333083270817704, "grad_norm": 0.8762009143829346, "learning_rate": 2e-05, "loss": 0.652, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11730, "tokens_per_second_per_gpu": 10460.58, "total_tokens": 1158150941 }, { "epoch": 0.7333708427106777, "grad_norm": 0.9177287817001343, "learning_rate": 2e-05, "loss": 0.6382, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11731, "tokens_per_second_per_gpu": 10439.78, "total_tokens": 1158249726 }, { "epoch": 0.7334333583395849, "grad_norm": 0.894642174243927, "learning_rate": 2e-05, "loss": 0.5962, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11732, "tokens_per_second_per_gpu": 10379.11, "total_tokens": 1158347597 }, { "epoch": 0.7334958739684921, "grad_norm": 0.9155134558677673, "learning_rate": 2e-05, "loss": 0.6362, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11733, "tokens_per_second_per_gpu": 10127.62, "total_tokens": 1158441882 }, { "epoch": 0.7335583895973994, "grad_norm": 0.9064366817474365, "learning_rate": 2e-05, "loss": 0.6319, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11734, "tokens_per_second_per_gpu": 10347.58, "total_tokens": 1158540377 }, { "epoch": 0.7336209052263066, "grad_norm": 0.8649728298187256, "learning_rate": 2e-05, "loss": 0.598, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11735, "tokens_per_second_per_gpu": 9896.06, "total_tokens": 1158638939 }, { "epoch": 0.7336834208552138, "grad_norm": 0.8985240459442139, "learning_rate": 2e-05, "loss": 0.6267, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11736, "tokens_per_second_per_gpu": 10454.62, "total_tokens": 1158734399 }, { "epoch": 0.733745936484121, "grad_norm": 0.8961226940155029, "learning_rate": 2e-05, "loss": 0.5853, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11737, "tokens_per_second_per_gpu": 10219.71, "total_tokens": 1158830108 }, { "epoch": 0.7338084521130283, "grad_norm": 0.8971286416053772, "learning_rate": 2e-05, "loss": 0.6253, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11738, "tokens_per_second_per_gpu": 10526.3, "total_tokens": 1158927005 }, { "epoch": 0.7338709677419355, "grad_norm": 0.9078065752983093, "learning_rate": 2e-05, "loss": 0.6029, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11739, "tokens_per_second_per_gpu": 11061.67, "total_tokens": 1159023081 }, { "epoch": 0.7339334833708427, "grad_norm": 0.9149242043495178, "learning_rate": 2e-05, "loss": 0.5958, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11740, "tokens_per_second_per_gpu": 9999.42, "total_tokens": 1159115847 }, { "epoch": 0.73399599899975, "grad_norm": 0.9042466878890991, "learning_rate": 2e-05, "loss": 0.5978, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11741, "tokens_per_second_per_gpu": 10250.53, "total_tokens": 1159213446 }, { "epoch": 0.7340585146286571, "grad_norm": 0.9078549146652222, "learning_rate": 2e-05, "loss": 0.6404, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11742, "tokens_per_second_per_gpu": 10004.18, "total_tokens": 1159314077 }, { "epoch": 0.7341210302575644, "grad_norm": 0.8628212213516235, "learning_rate": 2e-05, "loss": 0.5915, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11743, "tokens_per_second_per_gpu": 10553.64, "total_tokens": 1159416802 }, { "epoch": 0.7341835458864716, "grad_norm": 0.888322114944458, "learning_rate": 2e-05, "loss": 0.591, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11744, "tokens_per_second_per_gpu": 11428.79, "total_tokens": 1159513892 }, { "epoch": 0.7342460615153789, "grad_norm": 0.9162082672119141, "learning_rate": 2e-05, "loss": 0.568, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11745, "tokens_per_second_per_gpu": 10432.04, "total_tokens": 1159609900 }, { "epoch": 0.7343085771442861, "grad_norm": 0.8881021738052368, "learning_rate": 2e-05, "loss": 0.5753, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11746, "tokens_per_second_per_gpu": 9571.19, "total_tokens": 1159703136 }, { "epoch": 0.7343710927731933, "grad_norm": 0.8499166965484619, "learning_rate": 2e-05, "loss": 0.6388, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11747, "tokens_per_second_per_gpu": 11469.95, "total_tokens": 1159806218 }, { "epoch": 0.7344336084021005, "grad_norm": 0.941520094871521, "learning_rate": 2e-05, "loss": 0.6477, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11748, "tokens_per_second_per_gpu": 9532.62, "total_tokens": 1159897202 }, { "epoch": 0.7344961240310077, "grad_norm": 0.8953816294670105, "learning_rate": 2e-05, "loss": 0.6179, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11749, "tokens_per_second_per_gpu": 9584.64, "total_tokens": 1159992242 }, { "epoch": 0.734558639659915, "grad_norm": 0.8632791638374329, "learning_rate": 2e-05, "loss": 0.5773, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11750, "tokens_per_second_per_gpu": 10324.66, "total_tokens": 1160092915 }, { "epoch": 0.7346211552888222, "grad_norm": 0.9442422389984131, "learning_rate": 2e-05, "loss": 0.5848, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11751, "tokens_per_second_per_gpu": 10764.52, "total_tokens": 1160181567 }, { "epoch": 0.7346836709177295, "grad_norm": 0.9184309244155884, "learning_rate": 2e-05, "loss": 0.632, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11752, "tokens_per_second_per_gpu": 10547.5, "total_tokens": 1160278540 }, { "epoch": 0.7347461865466367, "grad_norm": 0.8446224331855774, "learning_rate": 2e-05, "loss": 0.6106, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11753, "tokens_per_second_per_gpu": 11119.99, "total_tokens": 1160378938 }, { "epoch": 0.7348087021755438, "grad_norm": 0.8761544227600098, "learning_rate": 2e-05, "loss": 0.5738, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11754, "tokens_per_second_per_gpu": 10058.57, "total_tokens": 1160474559 }, { "epoch": 0.7348712178044511, "grad_norm": 0.8950127363204956, "learning_rate": 2e-05, "loss": 0.653, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11755, "tokens_per_second_per_gpu": 10921.6, "total_tokens": 1160576414 }, { "epoch": 0.7349337334333583, "grad_norm": 0.9472100138664246, "learning_rate": 2e-05, "loss": 0.633, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11756, "tokens_per_second_per_gpu": 10399.79, "total_tokens": 1160672158 }, { "epoch": 0.7349962490622656, "grad_norm": 0.8793039321899414, "learning_rate": 2e-05, "loss": 0.6252, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11757, "tokens_per_second_per_gpu": 10522.71, "total_tokens": 1160771040 }, { "epoch": 0.7350587646911728, "grad_norm": 0.9282117486000061, "learning_rate": 2e-05, "loss": 0.5929, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11758, "tokens_per_second_per_gpu": 10260.16, "total_tokens": 1160869827 }, { "epoch": 0.7351212803200801, "grad_norm": 0.9072407484054565, "learning_rate": 2e-05, "loss": 0.6219, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11759, "tokens_per_second_per_gpu": 10865.99, "total_tokens": 1160966848 }, { "epoch": 0.7351837959489872, "grad_norm": 0.8977416753768921, "learning_rate": 2e-05, "loss": 0.6246, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11760, "tokens_per_second_per_gpu": 10688.46, "total_tokens": 1161066661 }, { "epoch": 0.7352463115778944, "grad_norm": 0.8940778970718384, "learning_rate": 2e-05, "loss": 0.6914, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11761, "tokens_per_second_per_gpu": 10294.84, "total_tokens": 1161169547 }, { "epoch": 0.7353088272068017, "grad_norm": 0.8557479977607727, "learning_rate": 2e-05, "loss": 0.615, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11762, "tokens_per_second_per_gpu": 11386.45, "total_tokens": 1161271445 }, { "epoch": 0.7353713428357089, "grad_norm": 0.8936730027198792, "learning_rate": 2e-05, "loss": 0.6293, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11763, "tokens_per_second_per_gpu": 10930.87, "total_tokens": 1161372454 }, { "epoch": 0.7354338584646162, "grad_norm": 0.874844491481781, "learning_rate": 2e-05, "loss": 0.5908, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11764, "tokens_per_second_per_gpu": 9846.08, "total_tokens": 1161466791 }, { "epoch": 0.7354963740935234, "grad_norm": 0.9101529717445374, "learning_rate": 2e-05, "loss": 0.6133, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11765, "tokens_per_second_per_gpu": 10780.34, "total_tokens": 1161567202 }, { "epoch": 0.7355588897224306, "grad_norm": 0.8530718088150024, "learning_rate": 2e-05, "loss": 0.6064, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11766, "tokens_per_second_per_gpu": 10437.48, "total_tokens": 1161666476 }, { "epoch": 0.7356214053513378, "grad_norm": 0.8968429565429688, "learning_rate": 2e-05, "loss": 0.666, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11767, "tokens_per_second_per_gpu": 11160.11, "total_tokens": 1161768628 }, { "epoch": 0.735683920980245, "grad_norm": 0.8990703821182251, "learning_rate": 2e-05, "loss": 0.5875, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11768, "tokens_per_second_per_gpu": 9937.55, "total_tokens": 1161866058 }, { "epoch": 0.7357464366091523, "grad_norm": 0.8818985819816589, "learning_rate": 2e-05, "loss": 0.6246, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11769, "tokens_per_second_per_gpu": 11345.93, "total_tokens": 1161970931 }, { "epoch": 0.7358089522380595, "grad_norm": 0.8669441342353821, "learning_rate": 2e-05, "loss": 0.6212, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11770, "tokens_per_second_per_gpu": 10490.9, "total_tokens": 1162070360 }, { "epoch": 0.7358714678669668, "grad_norm": 0.8546521663665771, "learning_rate": 2e-05, "loss": 0.6375, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11771, "tokens_per_second_per_gpu": 11146.03, "total_tokens": 1162172054 }, { "epoch": 0.735933983495874, "grad_norm": 0.8718418478965759, "learning_rate": 2e-05, "loss": 0.6067, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11772, "tokens_per_second_per_gpu": 10992.24, "total_tokens": 1162271835 }, { "epoch": 0.7359964991247812, "grad_norm": 0.9049972891807556, "learning_rate": 2e-05, "loss": 0.6539, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11773, "tokens_per_second_per_gpu": 10255.08, "total_tokens": 1162373313 }, { "epoch": 0.7360590147536884, "grad_norm": 0.8893167972564697, "learning_rate": 2e-05, "loss": 0.6316, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11774, "tokens_per_second_per_gpu": 10310.49, "total_tokens": 1162473852 }, { "epoch": 0.7361215303825956, "grad_norm": 0.9184991121292114, "learning_rate": 2e-05, "loss": 0.6223, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11775, "tokens_per_second_per_gpu": 10636.67, "total_tokens": 1162574363 }, { "epoch": 0.7361840460115029, "grad_norm": 0.8986454010009766, "learning_rate": 2e-05, "loss": 0.6318, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11776, "tokens_per_second_per_gpu": 10451.55, "total_tokens": 1162673222 }, { "epoch": 0.7362465616404101, "grad_norm": 0.9111918210983276, "learning_rate": 2e-05, "loss": 0.6214, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11777, "tokens_per_second_per_gpu": 11494.43, "total_tokens": 1162771183 }, { "epoch": 0.7363090772693174, "grad_norm": 0.8608810305595398, "learning_rate": 2e-05, "loss": 0.6089, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11778, "tokens_per_second_per_gpu": 10651.0, "total_tokens": 1162872631 }, { "epoch": 0.7363715928982245, "grad_norm": 0.866904079914093, "learning_rate": 2e-05, "loss": 0.5869, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11779, "tokens_per_second_per_gpu": 10164.95, "total_tokens": 1162970267 }, { "epoch": 0.7364341085271318, "grad_norm": 0.9233233332633972, "learning_rate": 2e-05, "loss": 0.6133, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11780, "tokens_per_second_per_gpu": 10558.88, "total_tokens": 1163068579 }, { "epoch": 0.736496624156039, "grad_norm": 0.9009566903114319, "learning_rate": 2e-05, "loss": 0.659, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11781, "tokens_per_second_per_gpu": 10435.15, "total_tokens": 1163166867 }, { "epoch": 0.7365591397849462, "grad_norm": 0.8683804869651794, "learning_rate": 2e-05, "loss": 0.617, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11782, "tokens_per_second_per_gpu": 11095.54, "total_tokens": 1163264681 }, { "epoch": 0.7366216554138535, "grad_norm": 0.8662635087966919, "learning_rate": 2e-05, "loss": 0.5974, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11783, "tokens_per_second_per_gpu": 11075.29, "total_tokens": 1163363338 }, { "epoch": 0.7366841710427607, "grad_norm": 0.9146623611450195, "learning_rate": 2e-05, "loss": 0.624, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11784, "tokens_per_second_per_gpu": 10127.04, "total_tokens": 1163457077 }, { "epoch": 0.7367466866716679, "grad_norm": 0.8876684308052063, "learning_rate": 2e-05, "loss": 0.6667, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11785, "tokens_per_second_per_gpu": 11158.36, "total_tokens": 1163559445 }, { "epoch": 0.7368092023005751, "grad_norm": 0.9146448969841003, "learning_rate": 2e-05, "loss": 0.6489, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11786, "tokens_per_second_per_gpu": 10468.34, "total_tokens": 1163662268 }, { "epoch": 0.7368717179294824, "grad_norm": 0.8531453013420105, "learning_rate": 2e-05, "loss": 0.6005, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11787, "tokens_per_second_per_gpu": 9726.19, "total_tokens": 1163762346 }, { "epoch": 0.7369342335583896, "grad_norm": 0.8661835789680481, "learning_rate": 2e-05, "loss": 0.6737, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11788, "tokens_per_second_per_gpu": 10795.08, "total_tokens": 1163866349 }, { "epoch": 0.7369967491872969, "grad_norm": 0.9022585153579712, "learning_rate": 2e-05, "loss": 0.6169, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11789, "tokens_per_second_per_gpu": 10071.22, "total_tokens": 1163965296 }, { "epoch": 0.7370592648162041, "grad_norm": 0.8887073993682861, "learning_rate": 2e-05, "loss": 0.6314, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11790, "tokens_per_second_per_gpu": 11169.45, "total_tokens": 1164063815 }, { "epoch": 0.7371217804451112, "grad_norm": 0.8657543063163757, "learning_rate": 2e-05, "loss": 0.6292, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11791, "tokens_per_second_per_gpu": 10539.29, "total_tokens": 1164165597 }, { "epoch": 0.7371842960740185, "grad_norm": 0.8971168994903564, "learning_rate": 2e-05, "loss": 0.6113, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11792, "tokens_per_second_per_gpu": 10294.43, "total_tokens": 1164263813 }, { "epoch": 0.7372468117029257, "grad_norm": 0.8604570031166077, "learning_rate": 2e-05, "loss": 0.587, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11793, "tokens_per_second_per_gpu": 9953.11, "total_tokens": 1164363004 }, { "epoch": 0.737309327331833, "grad_norm": 0.9011485576629639, "learning_rate": 2e-05, "loss": 0.6294, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11794, "tokens_per_second_per_gpu": 10253.09, "total_tokens": 1164460689 }, { "epoch": 0.7373718429607402, "grad_norm": 0.8847416639328003, "learning_rate": 2e-05, "loss": 0.6537, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11795, "tokens_per_second_per_gpu": 11495.05, "total_tokens": 1164568257 }, { "epoch": 0.7374343585896475, "grad_norm": 0.8693511486053467, "learning_rate": 2e-05, "loss": 0.646, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11796, "tokens_per_second_per_gpu": 11284.44, "total_tokens": 1164670268 }, { "epoch": 0.7374968742185546, "grad_norm": 0.8832747936248779, "learning_rate": 2e-05, "loss": 0.6182, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11797, "tokens_per_second_per_gpu": 9936.13, "total_tokens": 1164769255 }, { "epoch": 0.7375593898474618, "grad_norm": 0.8799323439598083, "learning_rate": 2e-05, "loss": 0.633, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11798, "tokens_per_second_per_gpu": 10704.7, "total_tokens": 1164870993 }, { "epoch": 0.7376219054763691, "grad_norm": 0.870425283908844, "learning_rate": 2e-05, "loss": 0.6051, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11799, "tokens_per_second_per_gpu": 10648.06, "total_tokens": 1164971477 }, { "epoch": 0.7376844211052763, "grad_norm": 0.9272475838661194, "learning_rate": 2e-05, "loss": 0.6249, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11800, "tokens_per_second_per_gpu": 10582.84, "total_tokens": 1165071347 }, { "epoch": 0.7377469367341836, "grad_norm": 0.890286386013031, "learning_rate": 2e-05, "loss": 0.612, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11801, "tokens_per_second_per_gpu": 10683.58, "total_tokens": 1165173179 }, { "epoch": 0.7378094523630908, "grad_norm": 0.8851302266120911, "learning_rate": 2e-05, "loss": 0.617, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11802, "tokens_per_second_per_gpu": 11276.74, "total_tokens": 1165276784 }, { "epoch": 0.737871967991998, "grad_norm": 0.882551372051239, "learning_rate": 2e-05, "loss": 0.6224, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11803, "tokens_per_second_per_gpu": 9944.37, "total_tokens": 1165374625 }, { "epoch": 0.7379344836209052, "grad_norm": 0.8740715980529785, "learning_rate": 2e-05, "loss": 0.6121, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11804, "tokens_per_second_per_gpu": 10338.06, "total_tokens": 1165470535 }, { "epoch": 0.7379969992498124, "grad_norm": 0.892667829990387, "learning_rate": 2e-05, "loss": 0.613, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11805, "tokens_per_second_per_gpu": 10126.6, "total_tokens": 1165564623 }, { "epoch": 0.7380595148787197, "grad_norm": 0.8730023503303528, "learning_rate": 2e-05, "loss": 0.6459, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11806, "tokens_per_second_per_gpu": 11143.24, "total_tokens": 1165667025 }, { "epoch": 0.7381220305076269, "grad_norm": 0.8913566470146179, "learning_rate": 2e-05, "loss": 0.6406, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11807, "tokens_per_second_per_gpu": 10976.85, "total_tokens": 1165764492 }, { "epoch": 0.7381845461365342, "grad_norm": 0.9143913388252258, "learning_rate": 2e-05, "loss": 0.6571, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11808, "tokens_per_second_per_gpu": 9877.62, "total_tokens": 1165863955 }, { "epoch": 0.7382470617654414, "grad_norm": 0.9069321751594543, "learning_rate": 2e-05, "loss": 0.623, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11809, "tokens_per_second_per_gpu": 9591.03, "total_tokens": 1165960693 }, { "epoch": 0.7383095773943485, "grad_norm": 0.9232662916183472, "learning_rate": 2e-05, "loss": 0.6167, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11810, "tokens_per_second_per_gpu": 10208.06, "total_tokens": 1166058219 }, { "epoch": 0.7383720930232558, "grad_norm": 0.9175224900245667, "learning_rate": 2e-05, "loss": 0.6881, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11811, "tokens_per_second_per_gpu": 9988.62, "total_tokens": 1166155492 }, { "epoch": 0.738434608652163, "grad_norm": 0.9843153357505798, "learning_rate": 2e-05, "loss": 0.6528, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11812, "tokens_per_second_per_gpu": 10719.47, "total_tokens": 1166255453 }, { "epoch": 0.7384971242810703, "grad_norm": 0.8461996912956238, "learning_rate": 2e-05, "loss": 0.6168, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11813, "tokens_per_second_per_gpu": 10915.58, "total_tokens": 1166359028 }, { "epoch": 0.7385596399099775, "grad_norm": 0.8799174427986145, "learning_rate": 2e-05, "loss": 0.6114, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11814, "tokens_per_second_per_gpu": 10650.42, "total_tokens": 1166462300 }, { "epoch": 0.7386221555388848, "grad_norm": 0.889703094959259, "learning_rate": 2e-05, "loss": 0.6057, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11815, "tokens_per_second_per_gpu": 10935.56, "total_tokens": 1166563075 }, { "epoch": 0.7386846711677919, "grad_norm": 0.9083669185638428, "learning_rate": 2e-05, "loss": 0.5975, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11816, "tokens_per_second_per_gpu": 10340.72, "total_tokens": 1166660072 }, { "epoch": 0.7387471867966992, "grad_norm": 0.9025498628616333, "learning_rate": 2e-05, "loss": 0.6106, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11817, "tokens_per_second_per_gpu": 9810.65, "total_tokens": 1166758675 }, { "epoch": 0.7388097024256064, "grad_norm": 0.8922668695449829, "learning_rate": 2e-05, "loss": 0.6136, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11818, "tokens_per_second_per_gpu": 9906.72, "total_tokens": 1166855931 }, { "epoch": 0.7388722180545136, "grad_norm": 0.9005420207977295, "learning_rate": 2e-05, "loss": 0.625, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11819, "tokens_per_second_per_gpu": 10855.72, "total_tokens": 1166955116 }, { "epoch": 0.7389347336834209, "grad_norm": 0.8915734887123108, "learning_rate": 2e-05, "loss": 0.6492, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11820, "tokens_per_second_per_gpu": 10774.38, "total_tokens": 1167056810 }, { "epoch": 0.7389972493123281, "grad_norm": 0.8918601870536804, "learning_rate": 2e-05, "loss": 0.6635, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11821, "tokens_per_second_per_gpu": 10626.56, "total_tokens": 1167155291 }, { "epoch": 0.7390597649412353, "grad_norm": 0.9193605184555054, "learning_rate": 2e-05, "loss": 0.6553, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11822, "tokens_per_second_per_gpu": 10824.69, "total_tokens": 1167253559 }, { "epoch": 0.7391222805701425, "grad_norm": 0.8827996850013733, "learning_rate": 2e-05, "loss": 0.5995, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11823, "tokens_per_second_per_gpu": 9592.47, "total_tokens": 1167355260 }, { "epoch": 0.7391847961990498, "grad_norm": 0.931460976600647, "learning_rate": 2e-05, "loss": 0.6305, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11824, "tokens_per_second_per_gpu": 10112.38, "total_tokens": 1167452819 }, { "epoch": 0.739247311827957, "grad_norm": 0.923851490020752, "learning_rate": 2e-05, "loss": 0.6397, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11825, "tokens_per_second_per_gpu": 9952.52, "total_tokens": 1167551723 }, { "epoch": 0.7393098274568642, "grad_norm": 0.8951507210731506, "learning_rate": 2e-05, "loss": 0.6614, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11826, "tokens_per_second_per_gpu": 10654.48, "total_tokens": 1167651295 }, { "epoch": 0.7393723430857715, "grad_norm": 0.8822202682495117, "learning_rate": 2e-05, "loss": 0.609, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11827, "tokens_per_second_per_gpu": 10362.75, "total_tokens": 1167747802 }, { "epoch": 0.7394348587146786, "grad_norm": 0.9086371660232544, "learning_rate": 2e-05, "loss": 0.617, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11828, "tokens_per_second_per_gpu": 10563.02, "total_tokens": 1167846280 }, { "epoch": 0.7394973743435859, "grad_norm": 0.9064545035362244, "learning_rate": 2e-05, "loss": 0.6268, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11829, "tokens_per_second_per_gpu": 10345.7, "total_tokens": 1167943885 }, { "epoch": 0.7395598899724931, "grad_norm": 0.9303287267684937, "learning_rate": 2e-05, "loss": 0.6924, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11830, "tokens_per_second_per_gpu": 10976.69, "total_tokens": 1168045512 }, { "epoch": 0.7396224056014004, "grad_norm": 0.8881279230117798, "learning_rate": 2e-05, "loss": 0.6068, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11831, "tokens_per_second_per_gpu": 10856.51, "total_tokens": 1168145969 }, { "epoch": 0.7396849212303076, "grad_norm": 0.9209581017494202, "learning_rate": 2e-05, "loss": 0.6051, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11832, "tokens_per_second_per_gpu": 10289.25, "total_tokens": 1168246530 }, { "epoch": 0.7397474368592148, "grad_norm": 0.8998818397521973, "learning_rate": 2e-05, "loss": 0.5723, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11833, "tokens_per_second_per_gpu": 9763.55, "total_tokens": 1168342394 }, { "epoch": 0.739809952488122, "grad_norm": 0.9067854285240173, "learning_rate": 2e-05, "loss": 0.6204, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11834, "tokens_per_second_per_gpu": 10780.71, "total_tokens": 1168440503 }, { "epoch": 0.7398724681170292, "grad_norm": 0.9062396883964539, "learning_rate": 2e-05, "loss": 0.6071, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11835, "tokens_per_second_per_gpu": 10285.21, "total_tokens": 1168535808 }, { "epoch": 0.7399349837459365, "grad_norm": 0.8842565417289734, "learning_rate": 2e-05, "loss": 0.6314, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11836, "tokens_per_second_per_gpu": 10779.34, "total_tokens": 1168635289 }, { "epoch": 0.7399974993748437, "grad_norm": 0.8927658200263977, "learning_rate": 2e-05, "loss": 0.6355, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11837, "tokens_per_second_per_gpu": 10642.0, "total_tokens": 1168738377 }, { "epoch": 0.740060015003751, "grad_norm": 0.8839433789253235, "learning_rate": 2e-05, "loss": 0.6573, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11838, "tokens_per_second_per_gpu": 10928.12, "total_tokens": 1168840544 }, { "epoch": 0.7401225306326582, "grad_norm": 0.9553301930427551, "learning_rate": 2e-05, "loss": 0.6284, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11839, "tokens_per_second_per_gpu": 10113.47, "total_tokens": 1168935393 }, { "epoch": 0.7401850462615653, "grad_norm": 0.8550474643707275, "learning_rate": 2e-05, "loss": 0.5927, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11840, "tokens_per_second_per_gpu": 9697.52, "total_tokens": 1169032212 }, { "epoch": 0.7402475618904726, "grad_norm": 0.9116918444633484, "learning_rate": 2e-05, "loss": 0.6375, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11841, "tokens_per_second_per_gpu": 10070.49, "total_tokens": 1169130047 }, { "epoch": 0.7403100775193798, "grad_norm": 0.877610445022583, "learning_rate": 2e-05, "loss": 0.6164, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11842, "tokens_per_second_per_gpu": 10406.71, "total_tokens": 1169229578 }, { "epoch": 0.7403725931482871, "grad_norm": 0.8998520970344543, "learning_rate": 2e-05, "loss": 0.6584, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11843, "tokens_per_second_per_gpu": 10967.48, "total_tokens": 1169330333 }, { "epoch": 0.7404351087771943, "grad_norm": 0.8627709746360779, "learning_rate": 2e-05, "loss": 0.614, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11844, "tokens_per_second_per_gpu": 10241.97, "total_tokens": 1169433011 }, { "epoch": 0.7404976244061016, "grad_norm": 0.9372463226318359, "learning_rate": 2e-05, "loss": 0.6156, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11845, "tokens_per_second_per_gpu": 9490.77, "total_tokens": 1169528565 }, { "epoch": 0.7405601400350088, "grad_norm": 0.855989396572113, "learning_rate": 2e-05, "loss": 0.5664, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11846, "tokens_per_second_per_gpu": 10491.37, "total_tokens": 1169627849 }, { "epoch": 0.7406226556639159, "grad_norm": 0.8799766898155212, "learning_rate": 2e-05, "loss": 0.6212, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11847, "tokens_per_second_per_gpu": 10518.88, "total_tokens": 1169723884 }, { "epoch": 0.7406851712928232, "grad_norm": 0.8834372758865356, "learning_rate": 2e-05, "loss": 0.6546, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11848, "tokens_per_second_per_gpu": 10549.51, "total_tokens": 1169825503 }, { "epoch": 0.7407476869217304, "grad_norm": 0.8765948414802551, "learning_rate": 2e-05, "loss": 0.5933, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11849, "tokens_per_second_per_gpu": 10720.68, "total_tokens": 1169923609 }, { "epoch": 0.7408102025506377, "grad_norm": 0.9438104033470154, "learning_rate": 2e-05, "loss": 0.6285, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11850, "tokens_per_second_per_gpu": 10660.5, "total_tokens": 1170020581 }, { "epoch": 0.7408727181795449, "grad_norm": 0.8830546736717224, "learning_rate": 2e-05, "loss": 0.6285, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11851, "tokens_per_second_per_gpu": 11113.32, "total_tokens": 1170120212 }, { "epoch": 0.7409352338084522, "grad_norm": 0.8712311387062073, "learning_rate": 2e-05, "loss": 0.5837, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11852, "tokens_per_second_per_gpu": 11003.12, "total_tokens": 1170218955 }, { "epoch": 0.7409977494373593, "grad_norm": 0.8855714797973633, "learning_rate": 2e-05, "loss": 0.6359, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11853, "tokens_per_second_per_gpu": 10318.37, "total_tokens": 1170317603 }, { "epoch": 0.7410602650662665, "grad_norm": 0.9290133714675903, "learning_rate": 2e-05, "loss": 0.6397, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11854, "tokens_per_second_per_gpu": 9954.65, "total_tokens": 1170412421 }, { "epoch": 0.7411227806951738, "grad_norm": 0.8921036124229431, "learning_rate": 2e-05, "loss": 0.6303, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11855, "tokens_per_second_per_gpu": 10551.33, "total_tokens": 1170512307 }, { "epoch": 0.741185296324081, "grad_norm": 0.9639149308204651, "learning_rate": 2e-05, "loss": 0.6886, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11856, "tokens_per_second_per_gpu": 10842.55, "total_tokens": 1170613456 }, { "epoch": 0.7412478119529883, "grad_norm": 0.898256778717041, "learning_rate": 2e-05, "loss": 0.6788, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11857, "tokens_per_second_per_gpu": 11001.29, "total_tokens": 1170716079 }, { "epoch": 0.7413103275818955, "grad_norm": 0.9052425026893616, "learning_rate": 2e-05, "loss": 0.6282, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11858, "tokens_per_second_per_gpu": 11233.46, "total_tokens": 1170814755 }, { "epoch": 0.7413728432108027, "grad_norm": 0.8948003649711609, "learning_rate": 2e-05, "loss": 0.6412, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11859, "tokens_per_second_per_gpu": 10556.65, "total_tokens": 1170912459 }, { "epoch": 0.7414353588397099, "grad_norm": 0.8900504112243652, "learning_rate": 2e-05, "loss": 0.6043, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11860, "tokens_per_second_per_gpu": 9913.42, "total_tokens": 1171011796 }, { "epoch": 0.7414978744686171, "grad_norm": 0.8965035080909729, "learning_rate": 2e-05, "loss": 0.6367, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11861, "tokens_per_second_per_gpu": 10107.2, "total_tokens": 1171112444 }, { "epoch": 0.7415603900975244, "grad_norm": 0.8918569684028625, "learning_rate": 2e-05, "loss": 0.6423, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11862, "tokens_per_second_per_gpu": 10475.9, "total_tokens": 1171213961 }, { "epoch": 0.7416229057264316, "grad_norm": 0.8689547777175903, "learning_rate": 2e-05, "loss": 0.6278, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11863, "tokens_per_second_per_gpu": 11019.7, "total_tokens": 1171316385 }, { "epoch": 0.7416854213553389, "grad_norm": 0.8732792139053345, "learning_rate": 2e-05, "loss": 0.5675, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11864, "tokens_per_second_per_gpu": 9878.98, "total_tokens": 1171406978 }, { "epoch": 0.741747936984246, "grad_norm": 0.8974781632423401, "learning_rate": 2e-05, "loss": 0.6314, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11865, "tokens_per_second_per_gpu": 11356.24, "total_tokens": 1171509040 }, { "epoch": 0.7418104526131533, "grad_norm": 0.8836261630058289, "learning_rate": 2e-05, "loss": 0.6083, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11866, "tokens_per_second_per_gpu": 10915.64, "total_tokens": 1171605585 }, { "epoch": 0.7418729682420605, "grad_norm": 0.8814653158187866, "learning_rate": 2e-05, "loss": 0.6395, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11867, "tokens_per_second_per_gpu": 10805.64, "total_tokens": 1171705065 }, { "epoch": 0.7419354838709677, "grad_norm": 0.9022731184959412, "learning_rate": 2e-05, "loss": 0.6097, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11868, "tokens_per_second_per_gpu": 9635.04, "total_tokens": 1171800567 }, { "epoch": 0.741997999499875, "grad_norm": 0.9247502684593201, "learning_rate": 2e-05, "loss": 0.6344, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11869, "tokens_per_second_per_gpu": 10665.32, "total_tokens": 1171900210 }, { "epoch": 0.7420605151287822, "grad_norm": 0.8982787132263184, "learning_rate": 2e-05, "loss": 0.6234, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11870, "tokens_per_second_per_gpu": 10791.51, "total_tokens": 1172000942 }, { "epoch": 0.7421230307576894, "grad_norm": 0.9178593754768372, "learning_rate": 2e-05, "loss": 0.6301, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11871, "tokens_per_second_per_gpu": 10756.06, "total_tokens": 1172098220 }, { "epoch": 0.7421855463865966, "grad_norm": 0.8837025761604309, "learning_rate": 2e-05, "loss": 0.623, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11872, "tokens_per_second_per_gpu": 11057.66, "total_tokens": 1172195589 }, { "epoch": 0.7422480620155039, "grad_norm": 0.8675951361656189, "learning_rate": 2e-05, "loss": 0.6122, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11873, "tokens_per_second_per_gpu": 10511.43, "total_tokens": 1172296276 }, { "epoch": 0.7423105776444111, "grad_norm": 0.8636503219604492, "learning_rate": 2e-05, "loss": 0.6369, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11874, "tokens_per_second_per_gpu": 10351.46, "total_tokens": 1172400173 }, { "epoch": 0.7423730932733184, "grad_norm": 0.9092756509780884, "learning_rate": 2e-05, "loss": 0.6181, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11875, "tokens_per_second_per_gpu": 10277.95, "total_tokens": 1172495325 }, { "epoch": 0.7424356089022256, "grad_norm": 0.9322739243507385, "learning_rate": 2e-05, "loss": 0.6064, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11876, "tokens_per_second_per_gpu": 10455.05, "total_tokens": 1172595571 }, { "epoch": 0.7424981245311327, "grad_norm": 0.9255849123001099, "learning_rate": 2e-05, "loss": 0.6319, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11877, "tokens_per_second_per_gpu": 10718.04, "total_tokens": 1172693721 }, { "epoch": 0.74256064016004, "grad_norm": 0.8833514451980591, "learning_rate": 2e-05, "loss": 0.6018, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11878, "tokens_per_second_per_gpu": 10413.52, "total_tokens": 1172789116 }, { "epoch": 0.7426231557889472, "grad_norm": 0.8673290610313416, "learning_rate": 2e-05, "loss": 0.6265, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11879, "tokens_per_second_per_gpu": 10578.31, "total_tokens": 1172889835 }, { "epoch": 0.7426856714178545, "grad_norm": 0.8875143527984619, "learning_rate": 2e-05, "loss": 0.5602, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11880, "tokens_per_second_per_gpu": 10144.72, "total_tokens": 1172984108 }, { "epoch": 0.7427481870467617, "grad_norm": 0.8858323693275452, "learning_rate": 2e-05, "loss": 0.6192, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11881, "tokens_per_second_per_gpu": 10910.62, "total_tokens": 1173085735 }, { "epoch": 0.742810702675669, "grad_norm": 0.9198639392852783, "learning_rate": 2e-05, "loss": 0.6271, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11882, "tokens_per_second_per_gpu": 10341.76, "total_tokens": 1173181354 }, { "epoch": 0.7428732183045762, "grad_norm": 0.8774714469909668, "learning_rate": 2e-05, "loss": 0.6228, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11883, "tokens_per_second_per_gpu": 10984.39, "total_tokens": 1173279249 }, { "epoch": 0.7429357339334833, "grad_norm": 0.8851131796836853, "learning_rate": 2e-05, "loss": 0.5998, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11884, "tokens_per_second_per_gpu": 11355.22, "total_tokens": 1173379203 }, { "epoch": 0.7429982495623906, "grad_norm": 0.8986613154411316, "learning_rate": 2e-05, "loss": 0.6486, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11885, "tokens_per_second_per_gpu": 11141.27, "total_tokens": 1173482661 }, { "epoch": 0.7430607651912978, "grad_norm": 0.9014446139335632, "learning_rate": 2e-05, "loss": 0.5932, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11886, "tokens_per_second_per_gpu": 9176.14, "total_tokens": 1173575775 }, { "epoch": 0.7431232808202051, "grad_norm": 0.8818103671073914, "learning_rate": 2e-05, "loss": 0.6373, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11887, "tokens_per_second_per_gpu": 10968.65, "total_tokens": 1173680551 }, { "epoch": 0.7431857964491123, "grad_norm": 0.862269937992096, "learning_rate": 2e-05, "loss": 0.6194, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11888, "tokens_per_second_per_gpu": 11175.1, "total_tokens": 1173784058 }, { "epoch": 0.7432483120780196, "grad_norm": 0.8994187116622925, "learning_rate": 2e-05, "loss": 0.6326, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11889, "tokens_per_second_per_gpu": 14260.52, "total_tokens": 1173886176 }, { "epoch": 0.7433108277069267, "grad_norm": 0.9265972971916199, "learning_rate": 2e-05, "loss": 0.623, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11890, "tokens_per_second_per_gpu": 10524.12, "total_tokens": 1173982429 }, { "epoch": 0.7433733433358339, "grad_norm": 0.9187842607498169, "learning_rate": 2e-05, "loss": 0.6411, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11891, "tokens_per_second_per_gpu": 9911.72, "total_tokens": 1174076373 }, { "epoch": 0.7434358589647412, "grad_norm": 0.8512056469917297, "learning_rate": 2e-05, "loss": 0.6333, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11892, "tokens_per_second_per_gpu": 11319.76, "total_tokens": 1174180582 }, { "epoch": 0.7434983745936484, "grad_norm": 0.9041271209716797, "learning_rate": 2e-05, "loss": 0.6129, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11893, "tokens_per_second_per_gpu": 10828.62, "total_tokens": 1174279781 }, { "epoch": 0.7435608902225557, "grad_norm": 0.8721914291381836, "learning_rate": 2e-05, "loss": 0.6439, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11894, "tokens_per_second_per_gpu": 10950.97, "total_tokens": 1174382506 }, { "epoch": 0.7436234058514629, "grad_norm": 0.8846749663352966, "learning_rate": 2e-05, "loss": 0.5909, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11895, "tokens_per_second_per_gpu": 10338.52, "total_tokens": 1174480034 }, { "epoch": 0.74368592148037, "grad_norm": 0.8533231616020203, "learning_rate": 2e-05, "loss": 0.5932, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11896, "tokens_per_second_per_gpu": 10257.86, "total_tokens": 1174582118 }, { "epoch": 0.7437484371092773, "grad_norm": 0.9920095801353455, "learning_rate": 2e-05, "loss": 0.615, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11897, "tokens_per_second_per_gpu": 11097.66, "total_tokens": 1174681329 }, { "epoch": 0.7438109527381845, "grad_norm": 0.9082921147346497, "learning_rate": 2e-05, "loss": 0.6194, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11898, "tokens_per_second_per_gpu": 11066.43, "total_tokens": 1174783889 }, { "epoch": 0.7438734683670918, "grad_norm": 0.8839054107666016, "learning_rate": 2e-05, "loss": 0.5908, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11899, "tokens_per_second_per_gpu": 10979.75, "total_tokens": 1174884550 }, { "epoch": 0.743935983995999, "grad_norm": 0.8952708840370178, "learning_rate": 2e-05, "loss": 0.6123, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11900, "tokens_per_second_per_gpu": 11077.18, "total_tokens": 1174986087 }, { "epoch": 0.7439984996249063, "grad_norm": 0.905443549156189, "learning_rate": 2e-05, "loss": 0.5987, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11901, "tokens_per_second_per_gpu": 10349.28, "total_tokens": 1175083229 }, { "epoch": 0.7440610152538134, "grad_norm": 0.9688570499420166, "learning_rate": 2e-05, "loss": 0.6204, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11902, "tokens_per_second_per_gpu": 10802.22, "total_tokens": 1175182953 }, { "epoch": 0.7441235308827207, "grad_norm": 0.8886504173278809, "learning_rate": 2e-05, "loss": 0.617, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11903, "tokens_per_second_per_gpu": 10639.62, "total_tokens": 1175283940 }, { "epoch": 0.7441860465116279, "grad_norm": 0.9245657324790955, "learning_rate": 2e-05, "loss": 0.6351, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11904, "tokens_per_second_per_gpu": 11010.8, "total_tokens": 1175380529 }, { "epoch": 0.7442485621405351, "grad_norm": 0.8920294642448425, "learning_rate": 2e-05, "loss": 0.714, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11905, "tokens_per_second_per_gpu": 11346.33, "total_tokens": 1175487756 }, { "epoch": 0.7443110777694424, "grad_norm": 0.8710004091262817, "learning_rate": 2e-05, "loss": 0.6298, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11906, "tokens_per_second_per_gpu": 10712.26, "total_tokens": 1175590258 }, { "epoch": 0.7443735933983496, "grad_norm": 0.9044585824012756, "learning_rate": 2e-05, "loss": 0.6554, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11907, "tokens_per_second_per_gpu": 10148.64, "total_tokens": 1175690188 }, { "epoch": 0.7444361090272568, "grad_norm": 0.8909107446670532, "learning_rate": 2e-05, "loss": 0.633, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11908, "tokens_per_second_per_gpu": 10896.62, "total_tokens": 1175791214 }, { "epoch": 0.744498624656164, "grad_norm": 0.8654136061668396, "learning_rate": 2e-05, "loss": 0.6147, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11909, "tokens_per_second_per_gpu": 10863.51, "total_tokens": 1175893134 }, { "epoch": 0.7445611402850713, "grad_norm": 0.8686676621437073, "learning_rate": 2e-05, "loss": 0.632, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11910, "tokens_per_second_per_gpu": 10285.6, "total_tokens": 1175991873 }, { "epoch": 0.7446236559139785, "grad_norm": 0.8763249516487122, "learning_rate": 2e-05, "loss": 0.5869, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11911, "tokens_per_second_per_gpu": 9669.28, "total_tokens": 1176085951 }, { "epoch": 0.7446861715428857, "grad_norm": 0.8900096416473389, "learning_rate": 2e-05, "loss": 0.6201, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11912, "tokens_per_second_per_gpu": 9229.86, "total_tokens": 1176183550 }, { "epoch": 0.744748687171793, "grad_norm": 0.9511562585830688, "learning_rate": 2e-05, "loss": 0.6333, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11913, "tokens_per_second_per_gpu": 9922.97, "total_tokens": 1176279303 }, { "epoch": 0.7448112028007001, "grad_norm": 0.891502857208252, "learning_rate": 2e-05, "loss": 0.608, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11914, "tokens_per_second_per_gpu": 10792.15, "total_tokens": 1176380130 }, { "epoch": 0.7448737184296074, "grad_norm": 0.9017487168312073, "learning_rate": 2e-05, "loss": 0.6078, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11915, "tokens_per_second_per_gpu": 10764.58, "total_tokens": 1176479046 }, { "epoch": 0.7449362340585146, "grad_norm": 0.9009154438972473, "learning_rate": 2e-05, "loss": 0.6154, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11916, "tokens_per_second_per_gpu": 10541.75, "total_tokens": 1176578735 }, { "epoch": 0.7449987496874219, "grad_norm": 0.9607310891151428, "learning_rate": 2e-05, "loss": 0.6077, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11917, "tokens_per_second_per_gpu": 10853.66, "total_tokens": 1176680931 }, { "epoch": 0.7450612653163291, "grad_norm": 0.9263550639152527, "learning_rate": 2e-05, "loss": 0.5999, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11918, "tokens_per_second_per_gpu": 10049.62, "total_tokens": 1176776013 }, { "epoch": 0.7451237809452363, "grad_norm": 0.8985634446144104, "learning_rate": 2e-05, "loss": 0.6194, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11919, "tokens_per_second_per_gpu": 10910.94, "total_tokens": 1176875366 }, { "epoch": 0.7451862965741435, "grad_norm": 0.90157151222229, "learning_rate": 2e-05, "loss": 0.604, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11920, "tokens_per_second_per_gpu": 9528.18, "total_tokens": 1176972566 }, { "epoch": 0.7452488122030507, "grad_norm": 0.9477292895317078, "learning_rate": 2e-05, "loss": 0.6461, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11921, "tokens_per_second_per_gpu": 10449.96, "total_tokens": 1177070661 }, { "epoch": 0.745311327831958, "grad_norm": 0.8773574829101562, "learning_rate": 2e-05, "loss": 0.6063, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11922, "tokens_per_second_per_gpu": 10957.78, "total_tokens": 1177173668 }, { "epoch": 0.7453738434608652, "grad_norm": 0.8823241591453552, "learning_rate": 2e-05, "loss": 0.6214, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11923, "tokens_per_second_per_gpu": 11121.42, "total_tokens": 1177274067 }, { "epoch": 0.7454363590897725, "grad_norm": 0.8761352300643921, "learning_rate": 2e-05, "loss": 0.6431, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11924, "tokens_per_second_per_gpu": 10724.66, "total_tokens": 1177375316 }, { "epoch": 0.7454988747186797, "grad_norm": 0.8801342844963074, "learning_rate": 2e-05, "loss": 0.613, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11925, "tokens_per_second_per_gpu": 9792.36, "total_tokens": 1177472482 }, { "epoch": 0.745561390347587, "grad_norm": 0.8608586192131042, "learning_rate": 2e-05, "loss": 0.5902, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11926, "tokens_per_second_per_gpu": 11006.12, "total_tokens": 1177575389 }, { "epoch": 0.7456239059764941, "grad_norm": 0.875796377658844, "learning_rate": 2e-05, "loss": 0.6445, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11927, "tokens_per_second_per_gpu": 11233.04, "total_tokens": 1177679347 }, { "epoch": 0.7456864216054013, "grad_norm": 0.908814549446106, "learning_rate": 2e-05, "loss": 0.6113, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11928, "tokens_per_second_per_gpu": 11142.79, "total_tokens": 1177779531 }, { "epoch": 0.7457489372343086, "grad_norm": 0.8975768089294434, "learning_rate": 2e-05, "loss": 0.6235, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11929, "tokens_per_second_per_gpu": 10442.31, "total_tokens": 1177878114 }, { "epoch": 0.7458114528632158, "grad_norm": 0.9331971406936646, "learning_rate": 2e-05, "loss": 0.6339, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11930, "tokens_per_second_per_gpu": 9823.92, "total_tokens": 1177973368 }, { "epoch": 0.7458739684921231, "grad_norm": 0.9338298439979553, "learning_rate": 2e-05, "loss": 0.6276, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11931, "tokens_per_second_per_gpu": 10338.42, "total_tokens": 1178071502 }, { "epoch": 0.7459364841210303, "grad_norm": 0.9228571653366089, "learning_rate": 2e-05, "loss": 0.6036, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11932, "tokens_per_second_per_gpu": 10254.61, "total_tokens": 1178166236 }, { "epoch": 0.7459989997499374, "grad_norm": 0.9026406407356262, "learning_rate": 2e-05, "loss": 0.6474, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11933, "tokens_per_second_per_gpu": 10857.81, "total_tokens": 1178262822 }, { "epoch": 0.7460615153788447, "grad_norm": 0.837849497795105, "learning_rate": 2e-05, "loss": 0.6149, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11934, "tokens_per_second_per_gpu": 11006.66, "total_tokens": 1178367167 }, { "epoch": 0.7461240310077519, "grad_norm": 0.90361088514328, "learning_rate": 2e-05, "loss": 0.5537, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11935, "tokens_per_second_per_gpu": 10227.3, "total_tokens": 1178458792 }, { "epoch": 0.7461865466366592, "grad_norm": 0.8549278974533081, "learning_rate": 2e-05, "loss": 0.6026, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11936, "tokens_per_second_per_gpu": 10877.95, "total_tokens": 1178562523 }, { "epoch": 0.7462490622655664, "grad_norm": 0.9252904653549194, "learning_rate": 2e-05, "loss": 0.6311, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11937, "tokens_per_second_per_gpu": 9841.63, "total_tokens": 1178657550 }, { "epoch": 0.7463115778944737, "grad_norm": 0.9046713709831238, "learning_rate": 2e-05, "loss": 0.655, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11938, "tokens_per_second_per_gpu": 10561.44, "total_tokens": 1178754516 }, { "epoch": 0.7463740935233808, "grad_norm": 0.8968455791473389, "learning_rate": 2e-05, "loss": 0.6391, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11939, "tokens_per_second_per_gpu": 10497.04, "total_tokens": 1178854277 }, { "epoch": 0.746436609152288, "grad_norm": 0.9139543175697327, "learning_rate": 2e-05, "loss": 0.6314, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11940, "tokens_per_second_per_gpu": 11242.88, "total_tokens": 1178953025 }, { "epoch": 0.7464991247811953, "grad_norm": 0.8791401386260986, "learning_rate": 2e-05, "loss": 0.6169, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11941, "tokens_per_second_per_gpu": 10958.2, "total_tokens": 1179054553 }, { "epoch": 0.7465616404101025, "grad_norm": 0.8773560523986816, "learning_rate": 2e-05, "loss": 0.6577, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11942, "tokens_per_second_per_gpu": 11052.24, "total_tokens": 1179157069 }, { "epoch": 0.7466241560390098, "grad_norm": 0.8957542777061462, "learning_rate": 2e-05, "loss": 0.6326, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11943, "tokens_per_second_per_gpu": 10511.34, "total_tokens": 1179255838 }, { "epoch": 0.746686671667917, "grad_norm": 0.8907886147499084, "learning_rate": 2e-05, "loss": 0.6165, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11944, "tokens_per_second_per_gpu": 10287.45, "total_tokens": 1179357251 }, { "epoch": 0.7467491872968242, "grad_norm": 0.8679863214492798, "learning_rate": 2e-05, "loss": 0.5889, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11945, "tokens_per_second_per_gpu": 10832.94, "total_tokens": 1179457402 }, { "epoch": 0.7468117029257314, "grad_norm": 0.8971551656723022, "learning_rate": 2e-05, "loss": 0.5946, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11946, "tokens_per_second_per_gpu": 10879.18, "total_tokens": 1179557786 }, { "epoch": 0.7468742185546386, "grad_norm": 0.8890089392662048, "learning_rate": 2e-05, "loss": 0.6068, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11947, "tokens_per_second_per_gpu": 10966.0, "total_tokens": 1179654346 }, { "epoch": 0.7469367341835459, "grad_norm": 0.9232653975486755, "learning_rate": 2e-05, "loss": 0.6288, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11948, "tokens_per_second_per_gpu": 10477.34, "total_tokens": 1179753164 }, { "epoch": 0.7469992498124531, "grad_norm": 0.9088094830513, "learning_rate": 2e-05, "loss": 0.6343, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11949, "tokens_per_second_per_gpu": 10055.91, "total_tokens": 1179849370 }, { "epoch": 0.7470617654413604, "grad_norm": 0.9275312423706055, "learning_rate": 2e-05, "loss": 0.66, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11950, "tokens_per_second_per_gpu": 10074.05, "total_tokens": 1179945920 }, { "epoch": 0.7471242810702675, "grad_norm": 0.8975936770439148, "learning_rate": 2e-05, "loss": 0.6239, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11951, "tokens_per_second_per_gpu": 9777.52, "total_tokens": 1180042047 }, { "epoch": 0.7471867966991748, "grad_norm": 0.8680588603019714, "learning_rate": 2e-05, "loss": 0.6071, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11952, "tokens_per_second_per_gpu": 10746.52, "total_tokens": 1180143989 }, { "epoch": 0.747249312328082, "grad_norm": 0.8622031211853027, "learning_rate": 2e-05, "loss": 0.6173, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11953, "tokens_per_second_per_gpu": 10205.88, "total_tokens": 1180243708 }, { "epoch": 0.7473118279569892, "grad_norm": 0.8407061100006104, "learning_rate": 2e-05, "loss": 0.6141, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11954, "tokens_per_second_per_gpu": 10236.77, "total_tokens": 1180341082 }, { "epoch": 0.7473743435858965, "grad_norm": 0.8787227272987366, "learning_rate": 2e-05, "loss": 0.6583, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11955, "tokens_per_second_per_gpu": 9934.49, "total_tokens": 1180439101 }, { "epoch": 0.7474368592148037, "grad_norm": 0.8995330333709717, "learning_rate": 2e-05, "loss": 0.6213, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11956, "tokens_per_second_per_gpu": 9969.03, "total_tokens": 1180530934 }, { "epoch": 0.7474993748437109, "grad_norm": 0.8735114336013794, "learning_rate": 2e-05, "loss": 0.5893, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11957, "tokens_per_second_per_gpu": 11365.85, "total_tokens": 1180627374 }, { "epoch": 0.7475618904726181, "grad_norm": 0.884270966053009, "learning_rate": 2e-05, "loss": 0.5738, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11958, "tokens_per_second_per_gpu": 10415.16, "total_tokens": 1180725139 }, { "epoch": 0.7476244061015254, "grad_norm": 0.8766347169876099, "learning_rate": 2e-05, "loss": 0.5996, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11959, "tokens_per_second_per_gpu": 9586.89, "total_tokens": 1180821713 }, { "epoch": 0.7476869217304326, "grad_norm": 0.8682388663291931, "learning_rate": 2e-05, "loss": 0.6179, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11960, "tokens_per_second_per_gpu": 11049.55, "total_tokens": 1180925737 }, { "epoch": 0.7477494373593399, "grad_norm": 0.9167034029960632, "learning_rate": 2e-05, "loss": 0.6491, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11961, "tokens_per_second_per_gpu": 10624.01, "total_tokens": 1181021959 }, { "epoch": 0.7478119529882471, "grad_norm": 0.8740867972373962, "learning_rate": 2e-05, "loss": 0.6115, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11962, "tokens_per_second_per_gpu": 11155.79, "total_tokens": 1181119902 }, { "epoch": 0.7478744686171543, "grad_norm": 0.93470299243927, "learning_rate": 2e-05, "loss": 0.6167, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11963, "tokens_per_second_per_gpu": 10235.41, "total_tokens": 1181216198 }, { "epoch": 0.7479369842460615, "grad_norm": 0.8689917325973511, "learning_rate": 2e-05, "loss": 0.5932, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11964, "tokens_per_second_per_gpu": 10763.59, "total_tokens": 1181312688 }, { "epoch": 0.7479994998749687, "grad_norm": 0.8625820875167847, "learning_rate": 2e-05, "loss": 0.5876, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11965, "tokens_per_second_per_gpu": 10443.53, "total_tokens": 1181410933 }, { "epoch": 0.748062015503876, "grad_norm": 0.8464499711990356, "learning_rate": 2e-05, "loss": 0.6147, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11966, "tokens_per_second_per_gpu": 11121.91, "total_tokens": 1181516402 }, { "epoch": 0.7481245311327832, "grad_norm": 0.9208492040634155, "learning_rate": 2e-05, "loss": 0.6418, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11967, "tokens_per_second_per_gpu": 10569.24, "total_tokens": 1181617015 }, { "epoch": 0.7481870467616905, "grad_norm": 0.900549054145813, "learning_rate": 2e-05, "loss": 0.6241, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11968, "tokens_per_second_per_gpu": 10111.31, "total_tokens": 1181713248 }, { "epoch": 0.7482495623905977, "grad_norm": 0.8906060457229614, "learning_rate": 2e-05, "loss": 0.5965, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11969, "tokens_per_second_per_gpu": 10447.3, "total_tokens": 1181807100 }, { "epoch": 0.7483120780195048, "grad_norm": 0.8691836595535278, "learning_rate": 2e-05, "loss": 0.5916, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11970, "tokens_per_second_per_gpu": 10722.86, "total_tokens": 1181905406 }, { "epoch": 0.7483745936484121, "grad_norm": 0.9451886415481567, "learning_rate": 2e-05, "loss": 0.6188, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11971, "tokens_per_second_per_gpu": 10102.1, "total_tokens": 1182004089 }, { "epoch": 0.7484371092773193, "grad_norm": 0.9467660188674927, "learning_rate": 2e-05, "loss": 0.6342, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11972, "tokens_per_second_per_gpu": 10997.16, "total_tokens": 1182106025 }, { "epoch": 0.7484996249062266, "grad_norm": 0.8907616138458252, "learning_rate": 2e-05, "loss": 0.6398, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11973, "tokens_per_second_per_gpu": 10427.36, "total_tokens": 1182206922 }, { "epoch": 0.7485621405351338, "grad_norm": 0.8732290267944336, "learning_rate": 2e-05, "loss": 0.6093, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11974, "tokens_per_second_per_gpu": 10266.09, "total_tokens": 1182304870 }, { "epoch": 0.7486246561640411, "grad_norm": 0.8831419944763184, "learning_rate": 2e-05, "loss": 0.6186, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11975, "tokens_per_second_per_gpu": 10730.98, "total_tokens": 1182406808 }, { "epoch": 0.7486871717929482, "grad_norm": 0.8950927257537842, "learning_rate": 2e-05, "loss": 0.6154, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11976, "tokens_per_second_per_gpu": 10774.19, "total_tokens": 1182505288 }, { "epoch": 0.7487496874218554, "grad_norm": 0.8975294828414917, "learning_rate": 2e-05, "loss": 0.5877, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11977, "tokens_per_second_per_gpu": 10599.16, "total_tokens": 1182602044 }, { "epoch": 0.7488122030507627, "grad_norm": 0.9034609794616699, "learning_rate": 2e-05, "loss": 0.6368, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11978, "tokens_per_second_per_gpu": 10482.3, "total_tokens": 1182702912 }, { "epoch": 0.7488747186796699, "grad_norm": 0.8599186539649963, "learning_rate": 2e-05, "loss": 0.5927, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11979, "tokens_per_second_per_gpu": 10631.86, "total_tokens": 1182806364 }, { "epoch": 0.7489372343085772, "grad_norm": 0.9049424529075623, "learning_rate": 2e-05, "loss": 0.6571, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11980, "tokens_per_second_per_gpu": 11342.41, "total_tokens": 1182908822 }, { "epoch": 0.7489997499374844, "grad_norm": 0.9231562614440918, "learning_rate": 2e-05, "loss": 0.6513, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11981, "tokens_per_second_per_gpu": 9963.25, "total_tokens": 1183006981 }, { "epoch": 0.7490622655663915, "grad_norm": 0.9238401055335999, "learning_rate": 2e-05, "loss": 0.6551, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11982, "tokens_per_second_per_gpu": 11066.37, "total_tokens": 1183105765 }, { "epoch": 0.7491247811952988, "grad_norm": 0.9210029244422913, "learning_rate": 2e-05, "loss": 0.6128, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11983, "tokens_per_second_per_gpu": 10872.14, "total_tokens": 1183203513 }, { "epoch": 0.749187296824206, "grad_norm": 0.8601365089416504, "learning_rate": 2e-05, "loss": 0.5712, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11984, "tokens_per_second_per_gpu": 10203.86, "total_tokens": 1183300247 }, { "epoch": 0.7492498124531133, "grad_norm": 0.8858047723770142, "learning_rate": 2e-05, "loss": 0.631, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11985, "tokens_per_second_per_gpu": 10896.32, "total_tokens": 1183398418 }, { "epoch": 0.7493123280820205, "grad_norm": 0.8821138739585876, "learning_rate": 2e-05, "loss": 0.6964, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11986, "tokens_per_second_per_gpu": 10808.05, "total_tokens": 1183502073 }, { "epoch": 0.7493748437109278, "grad_norm": 0.8654041290283203, "learning_rate": 2e-05, "loss": 0.6368, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11987, "tokens_per_second_per_gpu": 10593.48, "total_tokens": 1183604950 }, { "epoch": 0.7494373593398349, "grad_norm": 0.8790369629859924, "learning_rate": 2e-05, "loss": 0.6074, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11988, "tokens_per_second_per_gpu": 10384.06, "total_tokens": 1183702885 }, { "epoch": 0.7494998749687422, "grad_norm": 0.9041138291358948, "learning_rate": 2e-05, "loss": 0.6156, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11989, "tokens_per_second_per_gpu": 10587.23, "total_tokens": 1183798297 }, { "epoch": 0.7495623905976494, "grad_norm": 0.8538650274276733, "learning_rate": 2e-05, "loss": 0.5864, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11990, "tokens_per_second_per_gpu": 10350.14, "total_tokens": 1183895594 }, { "epoch": 0.7496249062265566, "grad_norm": 0.9011678099632263, "learning_rate": 2e-05, "loss": 0.6051, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11991, "tokens_per_second_per_gpu": 10991.48, "total_tokens": 1183993573 }, { "epoch": 0.7496874218554639, "grad_norm": 0.8583242297172546, "learning_rate": 2e-05, "loss": 0.6079, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11992, "tokens_per_second_per_gpu": 10502.6, "total_tokens": 1184094730 }, { "epoch": 0.7497499374843711, "grad_norm": 0.8827404975891113, "learning_rate": 2e-05, "loss": 0.6062, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11993, "tokens_per_second_per_gpu": 10582.17, "total_tokens": 1184191953 }, { "epoch": 0.7498124531132783, "grad_norm": 0.9042165875434875, "learning_rate": 2e-05, "loss": 0.5677, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11994, "tokens_per_second_per_gpu": 10589.36, "total_tokens": 1184288393 }, { "epoch": 0.7498749687421855, "grad_norm": 0.8973963260650635, "learning_rate": 2e-05, "loss": 0.6416, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11995, "tokens_per_second_per_gpu": 11459.2, "total_tokens": 1184386192 }, { "epoch": 0.7499374843710928, "grad_norm": 0.8764362931251526, "learning_rate": 2e-05, "loss": 0.5909, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11996, "tokens_per_second_per_gpu": 10442.99, "total_tokens": 1184486922 }, { "epoch": 0.75, "grad_norm": 0.892282247543335, "learning_rate": 2e-05, "loss": 0.6301, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11997, "tokens_per_second_per_gpu": 10362.96, "total_tokens": 1184590050 }, { "epoch": 0.7500625156289072, "grad_norm": 0.9019635915756226, "learning_rate": 2e-05, "loss": 0.6152, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11998, "tokens_per_second_per_gpu": 9844.69, "total_tokens": 1184684028 }, { "epoch": 0.7501250312578145, "grad_norm": 0.9016615748405457, "learning_rate": 2e-05, "loss": 0.6534, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 11999, "tokens_per_second_per_gpu": 11511.88, "total_tokens": 1184782802 }, { "epoch": 0.7501875468867217, "grad_norm": 0.8492856025695801, "learning_rate": 2e-05, "loss": 0.6078, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12000, "tokens_per_second_per_gpu": 10654.0, "total_tokens": 1184879210 }, { "epoch": 0.7502500625156289, "grad_norm": 0.8831342458724976, "learning_rate": 2e-05, "loss": 0.642, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12001, "tokens_per_second_per_gpu": 10737.79, "total_tokens": 1184979300 }, { "epoch": 0.7503125781445361, "grad_norm": 0.8808624148368835, "learning_rate": 2e-05, "loss": 0.6202, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12002, "tokens_per_second_per_gpu": 11038.51, "total_tokens": 1185080182 }, { "epoch": 0.7503750937734434, "grad_norm": 0.8835715055465698, "learning_rate": 2e-05, "loss": 0.6243, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12003, "tokens_per_second_per_gpu": 10500.98, "total_tokens": 1185179368 }, { "epoch": 0.7504376094023506, "grad_norm": 0.8594437837600708, "learning_rate": 2e-05, "loss": 0.5949, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12004, "tokens_per_second_per_gpu": 10214.17, "total_tokens": 1185277221 }, { "epoch": 0.7505001250312578, "grad_norm": 0.8647327423095703, "learning_rate": 2e-05, "loss": 0.6387, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12005, "tokens_per_second_per_gpu": 10947.84, "total_tokens": 1185377995 }, { "epoch": 0.7505626406601651, "grad_norm": 0.9249345064163208, "learning_rate": 2e-05, "loss": 0.6386, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12006, "tokens_per_second_per_gpu": 10934.18, "total_tokens": 1185479035 }, { "epoch": 0.7506251562890722, "grad_norm": 0.9189443588256836, "learning_rate": 2e-05, "loss": 0.6061, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12007, "tokens_per_second_per_gpu": 10505.43, "total_tokens": 1185570414 }, { "epoch": 0.7506876719179795, "grad_norm": 0.8892456889152527, "learning_rate": 2e-05, "loss": 0.6592, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12008, "tokens_per_second_per_gpu": 10702.74, "total_tokens": 1185673209 }, { "epoch": 0.7507501875468867, "grad_norm": 0.9391191005706787, "learning_rate": 2e-05, "loss": 0.6548, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12009, "tokens_per_second_per_gpu": 10012.7, "total_tokens": 1185768817 }, { "epoch": 0.750812703175794, "grad_norm": 0.8508843779563904, "learning_rate": 2e-05, "loss": 0.6172, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12010, "tokens_per_second_per_gpu": 11032.58, "total_tokens": 1185872030 }, { "epoch": 0.7508752188047012, "grad_norm": 0.9009596109390259, "learning_rate": 2e-05, "loss": 0.6629, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12011, "tokens_per_second_per_gpu": 10867.04, "total_tokens": 1185976355 }, { "epoch": 0.7509377344336085, "grad_norm": 0.8955541253089905, "learning_rate": 2e-05, "loss": 0.6032, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12012, "tokens_per_second_per_gpu": 9986.39, "total_tokens": 1186072295 }, { "epoch": 0.7510002500625156, "grad_norm": 0.9130923748016357, "learning_rate": 2e-05, "loss": 0.6304, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12013, "tokens_per_second_per_gpu": 10489.33, "total_tokens": 1186168183 }, { "epoch": 0.7510627656914228, "grad_norm": 0.9227641820907593, "learning_rate": 2e-05, "loss": 0.6121, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12014, "tokens_per_second_per_gpu": 10337.65, "total_tokens": 1186264354 }, { "epoch": 0.7511252813203301, "grad_norm": 0.8956117630004883, "learning_rate": 2e-05, "loss": 0.5538, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12015, "tokens_per_second_per_gpu": 10200.03, "total_tokens": 1186360626 }, { "epoch": 0.7511877969492373, "grad_norm": 0.8771668672561646, "learning_rate": 2e-05, "loss": 0.5965, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12016, "tokens_per_second_per_gpu": 10309.75, "total_tokens": 1186459169 }, { "epoch": 0.7512503125781446, "grad_norm": 0.8862643837928772, "learning_rate": 2e-05, "loss": 0.6311, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12017, "tokens_per_second_per_gpu": 10667.99, "total_tokens": 1186557049 }, { "epoch": 0.7513128282070518, "grad_norm": 0.8713334798812866, "learning_rate": 2e-05, "loss": 0.6236, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12018, "tokens_per_second_per_gpu": 10642.69, "total_tokens": 1186656648 }, { "epoch": 0.7513753438359589, "grad_norm": 0.9055154919624329, "learning_rate": 2e-05, "loss": 0.5937, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12019, "tokens_per_second_per_gpu": 10934.49, "total_tokens": 1186754692 }, { "epoch": 0.7514378594648662, "grad_norm": 0.8953803777694702, "learning_rate": 2e-05, "loss": 0.6379, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12020, "tokens_per_second_per_gpu": 10455.62, "total_tokens": 1186853600 }, { "epoch": 0.7515003750937734, "grad_norm": 0.8869965672492981, "learning_rate": 2e-05, "loss": 0.6768, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12021, "tokens_per_second_per_gpu": 10079.59, "total_tokens": 1186952279 }, { "epoch": 0.7515628907226807, "grad_norm": 0.9050071239471436, "learning_rate": 2e-05, "loss": 0.5858, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12022, "tokens_per_second_per_gpu": 9856.74, "total_tokens": 1187045401 }, { "epoch": 0.7516254063515879, "grad_norm": 0.8992418646812439, "learning_rate": 2e-05, "loss": 0.689, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12023, "tokens_per_second_per_gpu": 11048.33, "total_tokens": 1187148896 }, { "epoch": 0.7516879219804952, "grad_norm": 0.8955450654029846, "learning_rate": 2e-05, "loss": 0.6132, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12024, "tokens_per_second_per_gpu": 11035.7, "total_tokens": 1187249349 }, { "epoch": 0.7517504376094023, "grad_norm": 0.8985142111778259, "learning_rate": 2e-05, "loss": 0.639, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12025, "tokens_per_second_per_gpu": 11259.4, "total_tokens": 1187350204 }, { "epoch": 0.7518129532383095, "grad_norm": 0.8970713019371033, "learning_rate": 2e-05, "loss": 0.5711, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12026, "tokens_per_second_per_gpu": 9953.87, "total_tokens": 1187445566 }, { "epoch": 0.7518754688672168, "grad_norm": 0.8918895721435547, "learning_rate": 2e-05, "loss": 0.605, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12027, "tokens_per_second_per_gpu": 10969.34, "total_tokens": 1187545530 }, { "epoch": 0.751937984496124, "grad_norm": 0.9090235829353333, "learning_rate": 2e-05, "loss": 0.6501, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12028, "tokens_per_second_per_gpu": 10517.69, "total_tokens": 1187642758 }, { "epoch": 0.7520005001250313, "grad_norm": 0.9007620811462402, "learning_rate": 2e-05, "loss": 0.6395, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12029, "tokens_per_second_per_gpu": 10124.21, "total_tokens": 1187743127 }, { "epoch": 0.7520630157539385, "grad_norm": 0.8518876433372498, "learning_rate": 2e-05, "loss": 0.5841, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12030, "tokens_per_second_per_gpu": 10455.99, "total_tokens": 1187840111 }, { "epoch": 0.7521255313828457, "grad_norm": 0.9379360675811768, "learning_rate": 2e-05, "loss": 0.6588, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12031, "tokens_per_second_per_gpu": 9181.85, "total_tokens": 1187932270 }, { "epoch": 0.7521880470117529, "grad_norm": 0.9105660319328308, "learning_rate": 2e-05, "loss": 0.6372, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12032, "tokens_per_second_per_gpu": 11126.95, "total_tokens": 1188035170 }, { "epoch": 0.7522505626406601, "grad_norm": 0.8758977055549622, "learning_rate": 2e-05, "loss": 0.6353, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12033, "tokens_per_second_per_gpu": 10581.1, "total_tokens": 1188137344 }, { "epoch": 0.7523130782695674, "grad_norm": 0.9012897610664368, "learning_rate": 2e-05, "loss": 0.6464, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12034, "tokens_per_second_per_gpu": 10614.62, "total_tokens": 1188231953 }, { "epoch": 0.7523755938984746, "grad_norm": 0.9126510620117188, "learning_rate": 2e-05, "loss": 0.6143, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12035, "tokens_per_second_per_gpu": 10724.42, "total_tokens": 1188330556 }, { "epoch": 0.7524381095273819, "grad_norm": 0.8789830207824707, "learning_rate": 2e-05, "loss": 0.6054, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12036, "tokens_per_second_per_gpu": 9967.11, "total_tokens": 1188427686 }, { "epoch": 0.7525006251562891, "grad_norm": 0.9108850955963135, "learning_rate": 2e-05, "loss": 0.6067, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12037, "tokens_per_second_per_gpu": 10248.56, "total_tokens": 1188522983 }, { "epoch": 0.7525631407851963, "grad_norm": 0.8920817375183105, "learning_rate": 2e-05, "loss": 0.614, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12038, "tokens_per_second_per_gpu": 11011.56, "total_tokens": 1188624609 }, { "epoch": 0.7526256564141035, "grad_norm": 0.9140990972518921, "learning_rate": 2e-05, "loss": 0.6328, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12039, "tokens_per_second_per_gpu": 10832.38, "total_tokens": 1188721294 }, { "epoch": 0.7526881720430108, "grad_norm": 0.8815776705741882, "learning_rate": 2e-05, "loss": 0.6121, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12040, "tokens_per_second_per_gpu": 10953.94, "total_tokens": 1188823139 }, { "epoch": 0.752750687671918, "grad_norm": 0.8815196752548218, "learning_rate": 2e-05, "loss": 0.6472, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12041, "tokens_per_second_per_gpu": 11279.97, "total_tokens": 1188929731 }, { "epoch": 0.7528132033008252, "grad_norm": 0.9112697243690491, "learning_rate": 2e-05, "loss": 0.5981, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12042, "tokens_per_second_per_gpu": 10653.06, "total_tokens": 1189028895 }, { "epoch": 0.7528757189297325, "grad_norm": 0.9028517603874207, "learning_rate": 2e-05, "loss": 0.5871, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12043, "tokens_per_second_per_gpu": 10228.21, "total_tokens": 1189125763 }, { "epoch": 0.7529382345586396, "grad_norm": 0.9136362671852112, "learning_rate": 2e-05, "loss": 0.6179, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12044, "tokens_per_second_per_gpu": 11206.1, "total_tokens": 1189226052 }, { "epoch": 0.7530007501875469, "grad_norm": 0.8768566250801086, "learning_rate": 2e-05, "loss": 0.6162, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12045, "tokens_per_second_per_gpu": 10296.27, "total_tokens": 1189327581 }, { "epoch": 0.7530632658164541, "grad_norm": 0.8603629469871521, "learning_rate": 2e-05, "loss": 0.5955, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12046, "tokens_per_second_per_gpu": 10044.21, "total_tokens": 1189424853 }, { "epoch": 0.7531257814453614, "grad_norm": 0.9066590666770935, "learning_rate": 2e-05, "loss": 0.5918, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12047, "tokens_per_second_per_gpu": 9316.87, "total_tokens": 1189517022 }, { "epoch": 0.7531882970742686, "grad_norm": 0.880438506603241, "learning_rate": 2e-05, "loss": 0.6165, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12048, "tokens_per_second_per_gpu": 10261.78, "total_tokens": 1189616776 }, { "epoch": 0.7532508127031758, "grad_norm": 0.8938416242599487, "learning_rate": 2e-05, "loss": 0.634, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12049, "tokens_per_second_per_gpu": 10722.91, "total_tokens": 1189716747 }, { "epoch": 0.753313328332083, "grad_norm": 0.9306579232215881, "learning_rate": 2e-05, "loss": 0.6194, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12050, "tokens_per_second_per_gpu": 9946.57, "total_tokens": 1189811547 }, { "epoch": 0.7533758439609902, "grad_norm": 0.9082613587379456, "learning_rate": 2e-05, "loss": 0.6584, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12051, "tokens_per_second_per_gpu": 10523.41, "total_tokens": 1189912156 }, { "epoch": 0.7534383595898975, "grad_norm": 0.8380460143089294, "learning_rate": 2e-05, "loss": 0.5866, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12052, "tokens_per_second_per_gpu": 10470.19, "total_tokens": 1190013553 }, { "epoch": 0.7535008752188047, "grad_norm": 0.8508194088935852, "learning_rate": 2e-05, "loss": 0.5777, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12053, "tokens_per_second_per_gpu": 10457.87, "total_tokens": 1190109983 }, { "epoch": 0.753563390847712, "grad_norm": 0.9020437002182007, "learning_rate": 2e-05, "loss": 0.6192, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12054, "tokens_per_second_per_gpu": 10518.76, "total_tokens": 1190210264 }, { "epoch": 0.7536259064766192, "grad_norm": 0.9334666728973389, "learning_rate": 2e-05, "loss": 0.6251, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12055, "tokens_per_second_per_gpu": 11097.67, "total_tokens": 1190310696 }, { "epoch": 0.7536884221055263, "grad_norm": 0.9305438995361328, "learning_rate": 2e-05, "loss": 0.6751, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12056, "tokens_per_second_per_gpu": 10983.83, "total_tokens": 1190410671 }, { "epoch": 0.7537509377344336, "grad_norm": 0.8783225417137146, "learning_rate": 2e-05, "loss": 0.6115, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12057, "tokens_per_second_per_gpu": 10218.95, "total_tokens": 1190504103 }, { "epoch": 0.7538134533633408, "grad_norm": 0.9295992255210876, "learning_rate": 2e-05, "loss": 0.6515, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12058, "tokens_per_second_per_gpu": 10590.2, "total_tokens": 1190605011 }, { "epoch": 0.7538759689922481, "grad_norm": 0.8845528960227966, "learning_rate": 2e-05, "loss": 0.6382, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12059, "tokens_per_second_per_gpu": 10570.6, "total_tokens": 1190705803 }, { "epoch": 0.7539384846211553, "grad_norm": 0.9077490568161011, "learning_rate": 2e-05, "loss": 0.621, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12060, "tokens_per_second_per_gpu": 10481.47, "total_tokens": 1190806175 }, { "epoch": 0.7540010002500626, "grad_norm": 0.874458909034729, "learning_rate": 2e-05, "loss": 0.6203, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12061, "tokens_per_second_per_gpu": 10095.81, "total_tokens": 1190907004 }, { "epoch": 0.7540635158789697, "grad_norm": 0.9023704528808594, "learning_rate": 2e-05, "loss": 0.642, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12062, "tokens_per_second_per_gpu": 10244.63, "total_tokens": 1191006126 }, { "epoch": 0.7541260315078769, "grad_norm": 0.8677769899368286, "learning_rate": 2e-05, "loss": 0.6259, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12063, "tokens_per_second_per_gpu": 10594.75, "total_tokens": 1191106553 }, { "epoch": 0.7541885471367842, "grad_norm": 0.9223243594169617, "learning_rate": 2e-05, "loss": 0.6506, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12064, "tokens_per_second_per_gpu": 11190.44, "total_tokens": 1191206395 }, { "epoch": 0.7542510627656914, "grad_norm": 0.8615907430648804, "learning_rate": 2e-05, "loss": 0.5855, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12065, "tokens_per_second_per_gpu": 10580.2, "total_tokens": 1191303778 }, { "epoch": 0.7543135783945987, "grad_norm": 0.8767644762992859, "learning_rate": 2e-05, "loss": 0.6046, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12066, "tokens_per_second_per_gpu": 10811.66, "total_tokens": 1191400351 }, { "epoch": 0.7543760940235059, "grad_norm": 0.9283773899078369, "learning_rate": 2e-05, "loss": 0.6117, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12067, "tokens_per_second_per_gpu": 11346.29, "total_tokens": 1191501393 }, { "epoch": 0.754438609652413, "grad_norm": 0.9053276181221008, "learning_rate": 2e-05, "loss": 0.62, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12068, "tokens_per_second_per_gpu": 10558.09, "total_tokens": 1191602311 }, { "epoch": 0.7545011252813203, "grad_norm": 0.8675528764724731, "learning_rate": 2e-05, "loss": 0.6072, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12069, "tokens_per_second_per_gpu": 10525.04, "total_tokens": 1191703047 }, { "epoch": 0.7545636409102275, "grad_norm": 0.9000207781791687, "learning_rate": 2e-05, "loss": 0.5917, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12070, "tokens_per_second_per_gpu": 9708.69, "total_tokens": 1191793971 }, { "epoch": 0.7546261565391348, "grad_norm": 0.8871299624443054, "learning_rate": 2e-05, "loss": 0.5831, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12071, "tokens_per_second_per_gpu": 10265.9, "total_tokens": 1191890398 }, { "epoch": 0.754688672168042, "grad_norm": 0.9089491367340088, "learning_rate": 2e-05, "loss": 0.598, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12072, "tokens_per_second_per_gpu": 9881.7, "total_tokens": 1191981796 }, { "epoch": 0.7547511877969493, "grad_norm": 0.882736325263977, "learning_rate": 2e-05, "loss": 0.6369, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12073, "tokens_per_second_per_gpu": 10879.66, "total_tokens": 1192084258 }, { "epoch": 0.7548137034258565, "grad_norm": 0.8848617076873779, "learning_rate": 2e-05, "loss": 0.6123, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12074, "tokens_per_second_per_gpu": 10590.17, "total_tokens": 1192182472 }, { "epoch": 0.7548762190547637, "grad_norm": 0.867135763168335, "learning_rate": 2e-05, "loss": 0.6349, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12075, "tokens_per_second_per_gpu": 10511.41, "total_tokens": 1192281330 }, { "epoch": 0.7549387346836709, "grad_norm": 0.8525585532188416, "learning_rate": 2e-05, "loss": 0.6174, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12076, "tokens_per_second_per_gpu": 11373.47, "total_tokens": 1192384789 }, { "epoch": 0.7550012503125781, "grad_norm": 0.875190794467926, "learning_rate": 2e-05, "loss": 0.6261, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12077, "tokens_per_second_per_gpu": 10954.19, "total_tokens": 1192484296 }, { "epoch": 0.7550637659414854, "grad_norm": 0.9052025675773621, "learning_rate": 2e-05, "loss": 0.6441, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12078, "tokens_per_second_per_gpu": 10335.32, "total_tokens": 1192583881 }, { "epoch": 0.7551262815703926, "grad_norm": 0.9024273753166199, "learning_rate": 2e-05, "loss": 0.6345, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12079, "tokens_per_second_per_gpu": 10649.27, "total_tokens": 1192682415 }, { "epoch": 0.7551887971992999, "grad_norm": 0.8939754366874695, "learning_rate": 2e-05, "loss": 0.61, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12080, "tokens_per_second_per_gpu": 10827.85, "total_tokens": 1192780784 }, { "epoch": 0.755251312828207, "grad_norm": 0.9446756839752197, "learning_rate": 2e-05, "loss": 0.6419, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12081, "tokens_per_second_per_gpu": 10535.82, "total_tokens": 1192879442 }, { "epoch": 0.7553138284571143, "grad_norm": 0.8942492008209229, "learning_rate": 2e-05, "loss": 0.6611, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12082, "tokens_per_second_per_gpu": 10980.9, "total_tokens": 1192984947 }, { "epoch": 0.7553763440860215, "grad_norm": 0.914287805557251, "learning_rate": 2e-05, "loss": 0.6359, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12083, "tokens_per_second_per_gpu": 10029.02, "total_tokens": 1193080887 }, { "epoch": 0.7554388597149287, "grad_norm": 0.8769886493682861, "learning_rate": 2e-05, "loss": 0.601, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12084, "tokens_per_second_per_gpu": 10204.86, "total_tokens": 1193181310 }, { "epoch": 0.755501375343836, "grad_norm": 0.8958282470703125, "learning_rate": 2e-05, "loss": 0.6345, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12085, "tokens_per_second_per_gpu": 10045.33, "total_tokens": 1193281672 }, { "epoch": 0.7555638909727432, "grad_norm": 0.9439014792442322, "learning_rate": 2e-05, "loss": 0.6707, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12086, "tokens_per_second_per_gpu": 10106.27, "total_tokens": 1193378792 }, { "epoch": 0.7556264066016504, "grad_norm": 0.8748743534088135, "learning_rate": 2e-05, "loss": 0.6102, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12087, "tokens_per_second_per_gpu": 10742.33, "total_tokens": 1193480280 }, { "epoch": 0.7556889222305576, "grad_norm": 0.86065274477005, "learning_rate": 2e-05, "loss": 0.6313, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12088, "tokens_per_second_per_gpu": 11280.63, "total_tokens": 1193582238 }, { "epoch": 0.7557514378594649, "grad_norm": 0.9053778648376465, "learning_rate": 2e-05, "loss": 0.6143, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12089, "tokens_per_second_per_gpu": 11036.81, "total_tokens": 1193681709 }, { "epoch": 0.7558139534883721, "grad_norm": 0.890417218208313, "learning_rate": 2e-05, "loss": 0.6306, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12090, "tokens_per_second_per_gpu": 10934.52, "total_tokens": 1193780640 }, { "epoch": 0.7558764691172793, "grad_norm": 0.9443800449371338, "learning_rate": 2e-05, "loss": 0.6874, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12091, "tokens_per_second_per_gpu": 11155.79, "total_tokens": 1193883151 }, { "epoch": 0.7559389847461866, "grad_norm": 0.8856903910636902, "learning_rate": 2e-05, "loss": 0.5897, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12092, "tokens_per_second_per_gpu": 10391.74, "total_tokens": 1193982141 }, { "epoch": 0.7560015003750937, "grad_norm": 0.8901451826095581, "learning_rate": 2e-05, "loss": 0.6253, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12093, "tokens_per_second_per_gpu": 10446.67, "total_tokens": 1194084033 }, { "epoch": 0.756064016004001, "grad_norm": 0.92836594581604, "learning_rate": 2e-05, "loss": 0.6571, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12094, "tokens_per_second_per_gpu": 9945.93, "total_tokens": 1194178944 }, { "epoch": 0.7561265316329082, "grad_norm": 0.8964483141899109, "learning_rate": 2e-05, "loss": 0.6255, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12095, "tokens_per_second_per_gpu": 10833.23, "total_tokens": 1194278523 }, { "epoch": 0.7561890472618155, "grad_norm": 0.8916386365890503, "learning_rate": 2e-05, "loss": 0.6371, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12096, "tokens_per_second_per_gpu": 10858.36, "total_tokens": 1194376316 }, { "epoch": 0.7562515628907227, "grad_norm": 0.9250870943069458, "learning_rate": 2e-05, "loss": 0.6169, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12097, "tokens_per_second_per_gpu": 10285.27, "total_tokens": 1194476345 }, { "epoch": 0.75631407851963, "grad_norm": 0.9125607013702393, "learning_rate": 2e-05, "loss": 0.639, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12098, "tokens_per_second_per_gpu": 10325.67, "total_tokens": 1194575999 }, { "epoch": 0.7563765941485371, "grad_norm": 0.876992404460907, "learning_rate": 2e-05, "loss": 0.6243, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12099, "tokens_per_second_per_gpu": 11522.31, "total_tokens": 1194676891 }, { "epoch": 0.7564391097774443, "grad_norm": 0.9160414338111877, "learning_rate": 2e-05, "loss": 0.6031, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12100, "tokens_per_second_per_gpu": 9989.66, "total_tokens": 1194773764 }, { "epoch": 0.7565016254063516, "grad_norm": 0.8911750316619873, "learning_rate": 2e-05, "loss": 0.6342, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12101, "tokens_per_second_per_gpu": 10529.63, "total_tokens": 1194875389 }, { "epoch": 0.7565641410352588, "grad_norm": 0.8886891603469849, "learning_rate": 2e-05, "loss": 0.6155, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12102, "tokens_per_second_per_gpu": 10352.55, "total_tokens": 1194971672 }, { "epoch": 0.7566266566641661, "grad_norm": 0.9282520413398743, "learning_rate": 2e-05, "loss": 0.6515, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12103, "tokens_per_second_per_gpu": 10406.02, "total_tokens": 1195070254 }, { "epoch": 0.7566891722930733, "grad_norm": 0.9055578112602234, "learning_rate": 2e-05, "loss": 0.6287, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12104, "tokens_per_second_per_gpu": 10756.85, "total_tokens": 1195168966 }, { "epoch": 0.7567516879219804, "grad_norm": 0.8896064758300781, "learning_rate": 2e-05, "loss": 0.6288, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12105, "tokens_per_second_per_gpu": 10806.07, "total_tokens": 1195269149 }, { "epoch": 0.7568142035508877, "grad_norm": 0.9338368773460388, "learning_rate": 2e-05, "loss": 0.625, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12106, "tokens_per_second_per_gpu": 10787.17, "total_tokens": 1195367337 }, { "epoch": 0.7568767191797949, "grad_norm": 0.887964129447937, "learning_rate": 2e-05, "loss": 0.5985, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12107, "tokens_per_second_per_gpu": 11160.4, "total_tokens": 1195468327 }, { "epoch": 0.7569392348087022, "grad_norm": 0.8898128867149353, "learning_rate": 2e-05, "loss": 0.6196, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12108, "tokens_per_second_per_gpu": 11188.85, "total_tokens": 1195567605 }, { "epoch": 0.7570017504376094, "grad_norm": 0.9203445911407471, "learning_rate": 2e-05, "loss": 0.6361, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12109, "tokens_per_second_per_gpu": 11257.52, "total_tokens": 1195669091 }, { "epoch": 0.7570642660665167, "grad_norm": 0.9448537826538086, "learning_rate": 2e-05, "loss": 0.6345, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12110, "tokens_per_second_per_gpu": 9487.63, "total_tokens": 1195762507 }, { "epoch": 0.7571267816954238, "grad_norm": 0.8756207823753357, "learning_rate": 2e-05, "loss": 0.6373, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12111, "tokens_per_second_per_gpu": 10747.68, "total_tokens": 1195860854 }, { "epoch": 0.757189297324331, "grad_norm": 0.8432711958885193, "learning_rate": 2e-05, "loss": 0.6185, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12112, "tokens_per_second_per_gpu": 10996.45, "total_tokens": 1195961387 }, { "epoch": 0.7572518129532383, "grad_norm": 0.844886839389801, "learning_rate": 2e-05, "loss": 0.6291, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12113, "tokens_per_second_per_gpu": 11276.16, "total_tokens": 1196065827 }, { "epoch": 0.7573143285821455, "grad_norm": 0.9609673619270325, "learning_rate": 2e-05, "loss": 0.6345, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12114, "tokens_per_second_per_gpu": 10237.54, "total_tokens": 1196162616 }, { "epoch": 0.7573768442110528, "grad_norm": 0.8817139267921448, "learning_rate": 2e-05, "loss": 0.659, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12115, "tokens_per_second_per_gpu": 10955.72, "total_tokens": 1196263442 }, { "epoch": 0.75743935983996, "grad_norm": 0.8644649386405945, "learning_rate": 2e-05, "loss": 0.6204, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12116, "tokens_per_second_per_gpu": 11037.59, "total_tokens": 1196365084 }, { "epoch": 0.7575018754688673, "grad_norm": 0.8839365839958191, "learning_rate": 2e-05, "loss": 0.6138, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12117, "tokens_per_second_per_gpu": 10161.61, "total_tokens": 1196463865 }, { "epoch": 0.7575643910977744, "grad_norm": 0.8838738203048706, "learning_rate": 2e-05, "loss": 0.627, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12118, "tokens_per_second_per_gpu": 10470.77, "total_tokens": 1196563827 }, { "epoch": 0.7576269067266816, "grad_norm": 0.8936984539031982, "learning_rate": 2e-05, "loss": 0.6314, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12119, "tokens_per_second_per_gpu": 10921.47, "total_tokens": 1196665533 }, { "epoch": 0.7576894223555889, "grad_norm": 0.9676139950752258, "learning_rate": 2e-05, "loss": 0.6495, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12120, "tokens_per_second_per_gpu": 11287.13, "total_tokens": 1196764595 }, { "epoch": 0.7577519379844961, "grad_norm": 0.9698945879936218, "learning_rate": 2e-05, "loss": 0.6287, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12121, "tokens_per_second_per_gpu": 10548.36, "total_tokens": 1196862931 }, { "epoch": 0.7578144536134034, "grad_norm": 0.886834442615509, "learning_rate": 2e-05, "loss": 0.6256, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12122, "tokens_per_second_per_gpu": 10678.54, "total_tokens": 1196962788 }, { "epoch": 0.7578769692423106, "grad_norm": 0.874051034450531, "learning_rate": 2e-05, "loss": 0.5994, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12123, "tokens_per_second_per_gpu": 10808.14, "total_tokens": 1197062876 }, { "epoch": 0.7579394848712178, "grad_norm": 0.9081121683120728, "learning_rate": 2e-05, "loss": 0.5828, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12124, "tokens_per_second_per_gpu": 9406.17, "total_tokens": 1197157851 }, { "epoch": 0.758002000500125, "grad_norm": 0.9569907784461975, "learning_rate": 2e-05, "loss": 0.6254, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12125, "tokens_per_second_per_gpu": 13761.2, "total_tokens": 1197254116 }, { "epoch": 0.7580645161290323, "grad_norm": 0.9054418206214905, "learning_rate": 2e-05, "loss": 0.6158, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12126, "tokens_per_second_per_gpu": 10865.84, "total_tokens": 1197352702 }, { "epoch": 0.7581270317579395, "grad_norm": 0.8913962841033936, "learning_rate": 2e-05, "loss": 0.632, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12127, "tokens_per_second_per_gpu": 10947.11, "total_tokens": 1197455777 }, { "epoch": 0.7581895473868467, "grad_norm": 0.9255838990211487, "learning_rate": 2e-05, "loss": 0.6427, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12128, "tokens_per_second_per_gpu": 10442.35, "total_tokens": 1197554504 }, { "epoch": 0.758252063015754, "grad_norm": 0.8978109955787659, "learning_rate": 2e-05, "loss": 0.6585, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12129, "tokens_per_second_per_gpu": 11135.24, "total_tokens": 1197653490 }, { "epoch": 0.7583145786446611, "grad_norm": 0.9281563758850098, "learning_rate": 2e-05, "loss": 0.6212, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12130, "tokens_per_second_per_gpu": 9663.2, "total_tokens": 1197747637 }, { "epoch": 0.7583770942735684, "grad_norm": 0.860894501209259, "learning_rate": 2e-05, "loss": 0.5968, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12131, "tokens_per_second_per_gpu": 11012.19, "total_tokens": 1197850529 }, { "epoch": 0.7584396099024756, "grad_norm": 0.915604829788208, "learning_rate": 2e-05, "loss": 0.5801, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12132, "tokens_per_second_per_gpu": 10070.13, "total_tokens": 1197945590 }, { "epoch": 0.7585021255313829, "grad_norm": 0.8699408173561096, "learning_rate": 2e-05, "loss": 0.582, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12133, "tokens_per_second_per_gpu": 10529.31, "total_tokens": 1198043729 }, { "epoch": 0.7585646411602901, "grad_norm": 0.8853434324264526, "learning_rate": 2e-05, "loss": 0.6019, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12134, "tokens_per_second_per_gpu": 10796.04, "total_tokens": 1198144394 }, { "epoch": 0.7586271567891973, "grad_norm": 0.8790895342826843, "learning_rate": 2e-05, "loss": 0.6023, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12135, "tokens_per_second_per_gpu": 10453.13, "total_tokens": 1198238289 }, { "epoch": 0.7586896724181045, "grad_norm": 0.9206717610359192, "learning_rate": 2e-05, "loss": 0.6379, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12136, "tokens_per_second_per_gpu": 10391.97, "total_tokens": 1198335944 }, { "epoch": 0.7587521880470117, "grad_norm": 0.8678569197654724, "learning_rate": 2e-05, "loss": 0.6158, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12137, "tokens_per_second_per_gpu": 10277.77, "total_tokens": 1198438106 }, { "epoch": 0.758814703675919, "grad_norm": 0.9166130423545837, "learning_rate": 2e-05, "loss": 0.6573, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12138, "tokens_per_second_per_gpu": 10562.91, "total_tokens": 1198539453 }, { "epoch": 0.7588772193048262, "grad_norm": 0.9030202627182007, "learning_rate": 2e-05, "loss": 0.6274, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12139, "tokens_per_second_per_gpu": 9923.5, "total_tokens": 1198636798 }, { "epoch": 0.7589397349337335, "grad_norm": 0.8710505366325378, "learning_rate": 2e-05, "loss": 0.6299, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12140, "tokens_per_second_per_gpu": 11039.26, "total_tokens": 1198740751 }, { "epoch": 0.7590022505626407, "grad_norm": 0.8812260031700134, "learning_rate": 2e-05, "loss": 0.565, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12141, "tokens_per_second_per_gpu": 10407.43, "total_tokens": 1198835475 }, { "epoch": 0.7590647661915478, "grad_norm": 0.9054120779037476, "learning_rate": 2e-05, "loss": 0.5931, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12142, "tokens_per_second_per_gpu": 10032.71, "total_tokens": 1198932507 }, { "epoch": 0.7591272818204551, "grad_norm": 0.875308632850647, "learning_rate": 2e-05, "loss": 0.6256, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12143, "tokens_per_second_per_gpu": 11009.21, "total_tokens": 1199033873 }, { "epoch": 0.7591897974493623, "grad_norm": 0.9478722810745239, "learning_rate": 2e-05, "loss": 0.6394, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12144, "tokens_per_second_per_gpu": 10359.88, "total_tokens": 1199133057 }, { "epoch": 0.7592523130782696, "grad_norm": 0.8820551633834839, "learning_rate": 2e-05, "loss": 0.6464, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12145, "tokens_per_second_per_gpu": 10217.91, "total_tokens": 1199231200 }, { "epoch": 0.7593148287071768, "grad_norm": 0.8699702620506287, "learning_rate": 2e-05, "loss": 0.6217, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12146, "tokens_per_second_per_gpu": 10397.84, "total_tokens": 1199328681 }, { "epoch": 0.7593773443360841, "grad_norm": 0.8791559934616089, "learning_rate": 2e-05, "loss": 0.6122, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12147, "tokens_per_second_per_gpu": 10827.26, "total_tokens": 1199424955 }, { "epoch": 0.7594398599649912, "grad_norm": 0.8681946396827698, "learning_rate": 2e-05, "loss": 0.5956, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12148, "tokens_per_second_per_gpu": 10689.78, "total_tokens": 1199526741 }, { "epoch": 0.7595023755938984, "grad_norm": 0.937826931476593, "learning_rate": 2e-05, "loss": 0.6173, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12149, "tokens_per_second_per_gpu": 10682.93, "total_tokens": 1199621822 }, { "epoch": 0.7595648912228057, "grad_norm": 0.8546088337898254, "learning_rate": 2e-05, "loss": 0.6188, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12150, "tokens_per_second_per_gpu": 11010.11, "total_tokens": 1199719450 }, { "epoch": 0.7596274068517129, "grad_norm": 0.8665488362312317, "learning_rate": 2e-05, "loss": 0.5936, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12151, "tokens_per_second_per_gpu": 11052.79, "total_tokens": 1199817797 }, { "epoch": 0.7596899224806202, "grad_norm": 0.8961222171783447, "learning_rate": 2e-05, "loss": 0.6075, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12152, "tokens_per_second_per_gpu": 10749.04, "total_tokens": 1199915670 }, { "epoch": 0.7597524381095274, "grad_norm": 0.8704767227172852, "learning_rate": 2e-05, "loss": 0.6274, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12153, "tokens_per_second_per_gpu": 10597.6, "total_tokens": 1200017603 }, { "epoch": 0.7598149537384347, "grad_norm": 0.8459801077842712, "learning_rate": 2e-05, "loss": 0.6014, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12154, "tokens_per_second_per_gpu": 10582.08, "total_tokens": 1200119386 }, { "epoch": 0.7598774693673418, "grad_norm": 0.8867222666740417, "learning_rate": 2e-05, "loss": 0.6045, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12155, "tokens_per_second_per_gpu": 10855.27, "total_tokens": 1200219722 }, { "epoch": 0.759939984996249, "grad_norm": 0.8985669612884521, "learning_rate": 2e-05, "loss": 0.6451, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12156, "tokens_per_second_per_gpu": 11017.68, "total_tokens": 1200319412 }, { "epoch": 0.7600025006251563, "grad_norm": 0.9072273373603821, "learning_rate": 2e-05, "loss": 0.6541, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12157, "tokens_per_second_per_gpu": 11098.86, "total_tokens": 1200419821 }, { "epoch": 0.7600650162540635, "grad_norm": 0.8761954307556152, "learning_rate": 2e-05, "loss": 0.6007, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12158, "tokens_per_second_per_gpu": 10436.97, "total_tokens": 1200515958 }, { "epoch": 0.7601275318829708, "grad_norm": 0.9004926681518555, "learning_rate": 2e-05, "loss": 0.6268, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12159, "tokens_per_second_per_gpu": 10367.69, "total_tokens": 1200613193 }, { "epoch": 0.760190047511878, "grad_norm": 0.8854594230651855, "learning_rate": 2e-05, "loss": 0.5993, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12160, "tokens_per_second_per_gpu": 10585.12, "total_tokens": 1200707792 }, { "epoch": 0.7602525631407852, "grad_norm": 0.9219900369644165, "learning_rate": 2e-05, "loss": 0.6097, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12161, "tokens_per_second_per_gpu": 10019.58, "total_tokens": 1200803031 }, { "epoch": 0.7603150787696924, "grad_norm": 0.90118807554245, "learning_rate": 2e-05, "loss": 0.6343, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12162, "tokens_per_second_per_gpu": 10978.97, "total_tokens": 1200904206 }, { "epoch": 0.7603775943985996, "grad_norm": 0.8823049068450928, "learning_rate": 2e-05, "loss": 0.6241, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12163, "tokens_per_second_per_gpu": 10313.42, "total_tokens": 1201002556 }, { "epoch": 0.7604401100275069, "grad_norm": 0.9108780026435852, "learning_rate": 2e-05, "loss": 0.6382, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12164, "tokens_per_second_per_gpu": 9703.83, "total_tokens": 1201099433 }, { "epoch": 0.7605026256564141, "grad_norm": 0.8668473958969116, "learning_rate": 2e-05, "loss": 0.6077, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12165, "tokens_per_second_per_gpu": 10560.02, "total_tokens": 1201199210 }, { "epoch": 0.7605651412853214, "grad_norm": 0.8760574460029602, "learning_rate": 2e-05, "loss": 0.6555, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12166, "tokens_per_second_per_gpu": 10455.25, "total_tokens": 1201297453 }, { "epoch": 0.7606276569142285, "grad_norm": 0.8759745359420776, "learning_rate": 2e-05, "loss": 0.6524, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12167, "tokens_per_second_per_gpu": 10853.75, "total_tokens": 1201397711 }, { "epoch": 0.7606901725431358, "grad_norm": 0.9336262941360474, "learning_rate": 2e-05, "loss": 0.6712, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12168, "tokens_per_second_per_gpu": 10569.88, "total_tokens": 1201496299 }, { "epoch": 0.760752688172043, "grad_norm": 0.8756936192512512, "learning_rate": 2e-05, "loss": 0.6741, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12169, "tokens_per_second_per_gpu": 10615.76, "total_tokens": 1201597154 }, { "epoch": 0.7608152038009502, "grad_norm": 0.9082596898078918, "learning_rate": 2e-05, "loss": 0.639, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12170, "tokens_per_second_per_gpu": 10576.29, "total_tokens": 1201696529 }, { "epoch": 0.7608777194298575, "grad_norm": 0.9376309514045715, "learning_rate": 2e-05, "loss": 0.6089, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12171, "tokens_per_second_per_gpu": 10928.25, "total_tokens": 1201792626 }, { "epoch": 0.7609402350587647, "grad_norm": 0.8774021863937378, "learning_rate": 2e-05, "loss": 0.6606, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12172, "tokens_per_second_per_gpu": 10749.17, "total_tokens": 1201895735 }, { "epoch": 0.7610027506876719, "grad_norm": 0.8673587441444397, "learning_rate": 2e-05, "loss": 0.6266, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12173, "tokens_per_second_per_gpu": 10381.13, "total_tokens": 1201995111 }, { "epoch": 0.7610652663165791, "grad_norm": 0.9008470773696899, "learning_rate": 2e-05, "loss": 0.6204, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12174, "tokens_per_second_per_gpu": 9981.62, "total_tokens": 1202093907 }, { "epoch": 0.7611277819454864, "grad_norm": 0.9156361818313599, "learning_rate": 2e-05, "loss": 0.6254, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12175, "tokens_per_second_per_gpu": 10144.81, "total_tokens": 1202185176 }, { "epoch": 0.7611902975743936, "grad_norm": 0.8828928470611572, "learning_rate": 2e-05, "loss": 0.6088, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12176, "tokens_per_second_per_gpu": 10274.12, "total_tokens": 1202281031 }, { "epoch": 0.7612528132033008, "grad_norm": 0.854557991027832, "learning_rate": 2e-05, "loss": 0.6229, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12177, "tokens_per_second_per_gpu": 10705.15, "total_tokens": 1202383004 }, { "epoch": 0.7613153288322081, "grad_norm": 0.9151781797409058, "learning_rate": 2e-05, "loss": 0.6693, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12178, "tokens_per_second_per_gpu": 11490.1, "total_tokens": 1202486862 }, { "epoch": 0.7613778444611152, "grad_norm": 0.9517501592636108, "learning_rate": 2e-05, "loss": 0.6451, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12179, "tokens_per_second_per_gpu": 11112.85, "total_tokens": 1202587154 }, { "epoch": 0.7614403600900225, "grad_norm": 0.9025579690933228, "learning_rate": 2e-05, "loss": 0.6087, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12180, "tokens_per_second_per_gpu": 10766.7, "total_tokens": 1202684951 }, { "epoch": 0.7615028757189297, "grad_norm": 0.8876926302909851, "learning_rate": 2e-05, "loss": 0.6113, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12181, "tokens_per_second_per_gpu": 10858.66, "total_tokens": 1202783350 }, { "epoch": 0.761565391347837, "grad_norm": 0.9071797728538513, "learning_rate": 2e-05, "loss": 0.599, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12182, "tokens_per_second_per_gpu": 10939.1, "total_tokens": 1202883405 }, { "epoch": 0.7616279069767442, "grad_norm": 0.8810502290725708, "learning_rate": 2e-05, "loss": 0.6376, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12183, "tokens_per_second_per_gpu": 10603.88, "total_tokens": 1202985152 }, { "epoch": 0.7616904226056515, "grad_norm": 0.8918325304985046, "learning_rate": 2e-05, "loss": 0.6039, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12184, "tokens_per_second_per_gpu": 10471.17, "total_tokens": 1203088493 }, { "epoch": 0.7617529382345586, "grad_norm": 0.8929681777954102, "learning_rate": 2e-05, "loss": 0.6027, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12185, "tokens_per_second_per_gpu": 11022.26, "total_tokens": 1203186774 }, { "epoch": 0.7618154538634658, "grad_norm": 0.8805145025253296, "learning_rate": 2e-05, "loss": 0.5902, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12186, "tokens_per_second_per_gpu": 10565.0, "total_tokens": 1203285816 }, { "epoch": 0.7618779694923731, "grad_norm": 0.8614694476127625, "learning_rate": 2e-05, "loss": 0.6005, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12187, "tokens_per_second_per_gpu": 10532.96, "total_tokens": 1203383212 }, { "epoch": 0.7619404851212803, "grad_norm": 0.9011185765266418, "learning_rate": 2e-05, "loss": 0.6021, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12188, "tokens_per_second_per_gpu": 10287.44, "total_tokens": 1203481013 }, { "epoch": 0.7620030007501876, "grad_norm": 0.8972759246826172, "learning_rate": 2e-05, "loss": 0.6119, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12189, "tokens_per_second_per_gpu": 10455.98, "total_tokens": 1203580570 }, { "epoch": 0.7620655163790948, "grad_norm": 0.8978311419487, "learning_rate": 2e-05, "loss": 0.6359, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12190, "tokens_per_second_per_gpu": 10322.76, "total_tokens": 1203683483 }, { "epoch": 0.762128032008002, "grad_norm": 0.8929941654205322, "learning_rate": 2e-05, "loss": 0.6015, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12191, "tokens_per_second_per_gpu": 10237.22, "total_tokens": 1203781544 }, { "epoch": 0.7621905476369092, "grad_norm": 0.8906058669090271, "learning_rate": 2e-05, "loss": 0.6333, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12192, "tokens_per_second_per_gpu": 10251.58, "total_tokens": 1203878020 }, { "epoch": 0.7622530632658164, "grad_norm": 0.862445592880249, "learning_rate": 2e-05, "loss": 0.6042, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12193, "tokens_per_second_per_gpu": 10701.7, "total_tokens": 1203979732 }, { "epoch": 0.7623155788947237, "grad_norm": 0.9264921545982361, "learning_rate": 2e-05, "loss": 0.5842, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12194, "tokens_per_second_per_gpu": 9189.83, "total_tokens": 1204076619 }, { "epoch": 0.7623780945236309, "grad_norm": 0.9668444991111755, "learning_rate": 2e-05, "loss": 0.6319, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12195, "tokens_per_second_per_gpu": 9276.82, "total_tokens": 1204170779 }, { "epoch": 0.7624406101525382, "grad_norm": 0.9100406169891357, "learning_rate": 2e-05, "loss": 0.6394, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12196, "tokens_per_second_per_gpu": 10169.4, "total_tokens": 1204267690 }, { "epoch": 0.7625031257814454, "grad_norm": 0.8793810606002808, "learning_rate": 2e-05, "loss": 0.6033, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12197, "tokens_per_second_per_gpu": 10294.85, "total_tokens": 1204366028 }, { "epoch": 0.7625656414103525, "grad_norm": 0.8806717991828918, "learning_rate": 2e-05, "loss": 0.6474, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12198, "tokens_per_second_per_gpu": 10833.38, "total_tokens": 1204468494 }, { "epoch": 0.7626281570392598, "grad_norm": 0.9350500702857971, "learning_rate": 2e-05, "loss": 0.5942, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12199, "tokens_per_second_per_gpu": 10270.35, "total_tokens": 1204567435 }, { "epoch": 0.762690672668167, "grad_norm": 0.9432070851325989, "learning_rate": 2e-05, "loss": 0.6046, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12200, "tokens_per_second_per_gpu": 10356.88, "total_tokens": 1204665506 }, { "epoch": 0.7627531882970743, "grad_norm": 0.8676934242248535, "learning_rate": 2e-05, "loss": 0.5864, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12201, "tokens_per_second_per_gpu": 10512.01, "total_tokens": 1204763999 }, { "epoch": 0.7628157039259815, "grad_norm": 0.8638265132904053, "learning_rate": 2e-05, "loss": 0.6335, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12202, "tokens_per_second_per_gpu": 10393.36, "total_tokens": 1204865012 }, { "epoch": 0.7628782195548888, "grad_norm": 0.8889628052711487, "learning_rate": 2e-05, "loss": 0.6254, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12203, "tokens_per_second_per_gpu": 10798.93, "total_tokens": 1204960623 }, { "epoch": 0.7629407351837959, "grad_norm": 0.8987707495689392, "learning_rate": 2e-05, "loss": 0.6508, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12204, "tokens_per_second_per_gpu": 10911.77, "total_tokens": 1205064449 }, { "epoch": 0.7630032508127031, "grad_norm": 0.9728682041168213, "learning_rate": 2e-05, "loss": 0.617, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12205, "tokens_per_second_per_gpu": 9838.72, "total_tokens": 1205162498 }, { "epoch": 0.7630657664416104, "grad_norm": 0.9194291234016418, "learning_rate": 2e-05, "loss": 0.6181, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12206, "tokens_per_second_per_gpu": 9694.94, "total_tokens": 1205258073 }, { "epoch": 0.7631282820705176, "grad_norm": 0.9128779172897339, "learning_rate": 2e-05, "loss": 0.6114, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12207, "tokens_per_second_per_gpu": 10516.92, "total_tokens": 1205354610 }, { "epoch": 0.7631907976994249, "grad_norm": 0.9018921852111816, "learning_rate": 2e-05, "loss": 0.6406, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12208, "tokens_per_second_per_gpu": 10173.47, "total_tokens": 1205453973 }, { "epoch": 0.7632533133283321, "grad_norm": 0.9002130627632141, "learning_rate": 2e-05, "loss": 0.6295, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12209, "tokens_per_second_per_gpu": 10448.62, "total_tokens": 1205553095 }, { "epoch": 0.7633158289572393, "grad_norm": 0.8635481595993042, "learning_rate": 2e-05, "loss": 0.6438, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12210, "tokens_per_second_per_gpu": 9938.54, "total_tokens": 1205655310 }, { "epoch": 0.7633783445861465, "grad_norm": 0.8963693976402283, "learning_rate": 2e-05, "loss": 0.6129, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12211, "tokens_per_second_per_gpu": 9899.68, "total_tokens": 1205751276 }, { "epoch": 0.7634408602150538, "grad_norm": 0.9222257137298584, "learning_rate": 2e-05, "loss": 0.6447, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12212, "tokens_per_second_per_gpu": 10472.28, "total_tokens": 1205851594 }, { "epoch": 0.763503375843961, "grad_norm": 0.908502459526062, "learning_rate": 2e-05, "loss": 0.6147, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12213, "tokens_per_second_per_gpu": 10481.73, "total_tokens": 1205950156 }, { "epoch": 0.7635658914728682, "grad_norm": 0.8656344413757324, "learning_rate": 2e-05, "loss": 0.6239, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12214, "tokens_per_second_per_gpu": 11462.72, "total_tokens": 1206047892 }, { "epoch": 0.7636284071017755, "grad_norm": 0.8648443222045898, "learning_rate": 2e-05, "loss": 0.6404, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12215, "tokens_per_second_per_gpu": 10898.02, "total_tokens": 1206149058 }, { "epoch": 0.7636909227306826, "grad_norm": 0.8812958002090454, "learning_rate": 2e-05, "loss": 0.6351, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12216, "tokens_per_second_per_gpu": 10463.51, "total_tokens": 1206248259 }, { "epoch": 0.7637534383595899, "grad_norm": 0.849105715751648, "learning_rate": 2e-05, "loss": 0.6332, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12217, "tokens_per_second_per_gpu": 10509.33, "total_tokens": 1206350661 }, { "epoch": 0.7638159539884971, "grad_norm": 0.9230448603630066, "learning_rate": 2e-05, "loss": 0.6424, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12218, "tokens_per_second_per_gpu": 10025.26, "total_tokens": 1206447628 }, { "epoch": 0.7638784696174044, "grad_norm": 0.9194443225860596, "learning_rate": 2e-05, "loss": 0.6432, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12219, "tokens_per_second_per_gpu": 10419.91, "total_tokens": 1206546315 }, { "epoch": 0.7639409852463116, "grad_norm": 0.8643993139266968, "learning_rate": 2e-05, "loss": 0.6337, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12220, "tokens_per_second_per_gpu": 9922.93, "total_tokens": 1206644612 }, { "epoch": 0.7640035008752188, "grad_norm": 0.9118166565895081, "learning_rate": 2e-05, "loss": 0.6493, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12221, "tokens_per_second_per_gpu": 11033.57, "total_tokens": 1206747319 }, { "epoch": 0.764066016504126, "grad_norm": 0.8776858448982239, "learning_rate": 2e-05, "loss": 0.6046, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12222, "tokens_per_second_per_gpu": 11188.92, "total_tokens": 1206844732 }, { "epoch": 0.7641285321330332, "grad_norm": 0.8949710130691528, "learning_rate": 2e-05, "loss": 0.6359, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12223, "tokens_per_second_per_gpu": 10405.93, "total_tokens": 1206943651 }, { "epoch": 0.7641910477619405, "grad_norm": 0.876171886920929, "learning_rate": 2e-05, "loss": 0.6321, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12224, "tokens_per_second_per_gpu": 10942.2, "total_tokens": 1207047646 }, { "epoch": 0.7642535633908477, "grad_norm": 0.8823802471160889, "learning_rate": 2e-05, "loss": 0.5755, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12225, "tokens_per_second_per_gpu": 10641.14, "total_tokens": 1207142369 }, { "epoch": 0.764316079019755, "grad_norm": 0.8799091577529907, "learning_rate": 2e-05, "loss": 0.6783, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12226, "tokens_per_second_per_gpu": 9810.39, "total_tokens": 1207242618 }, { "epoch": 0.7643785946486622, "grad_norm": 0.8986950516700745, "learning_rate": 2e-05, "loss": 0.5997, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12227, "tokens_per_second_per_gpu": 9924.86, "total_tokens": 1207341381 }, { "epoch": 0.7644411102775694, "grad_norm": 0.869317889213562, "learning_rate": 2e-05, "loss": 0.6064, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12228, "tokens_per_second_per_gpu": 10393.19, "total_tokens": 1207442078 }, { "epoch": 0.7645036259064766, "grad_norm": 0.9150484204292297, "learning_rate": 2e-05, "loss": 0.5893, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12229, "tokens_per_second_per_gpu": 10043.08, "total_tokens": 1207537788 }, { "epoch": 0.7645661415353838, "grad_norm": 0.8706417679786682, "learning_rate": 2e-05, "loss": 0.6419, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12230, "tokens_per_second_per_gpu": 11040.59, "total_tokens": 1207637615 }, { "epoch": 0.7646286571642911, "grad_norm": 0.8935049772262573, "learning_rate": 2e-05, "loss": 0.6335, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12231, "tokens_per_second_per_gpu": 10968.01, "total_tokens": 1207741251 }, { "epoch": 0.7646911727931983, "grad_norm": 0.9433006048202515, "learning_rate": 2e-05, "loss": 0.6477, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12232, "tokens_per_second_per_gpu": 10616.64, "total_tokens": 1207837806 }, { "epoch": 0.7647536884221056, "grad_norm": 0.8685761094093323, "learning_rate": 2e-05, "loss": 0.6208, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12233, "tokens_per_second_per_gpu": 10906.61, "total_tokens": 1207939713 }, { "epoch": 0.7648162040510128, "grad_norm": 0.8744692802429199, "learning_rate": 2e-05, "loss": 0.6227, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12234, "tokens_per_second_per_gpu": 10100.68, "total_tokens": 1208036462 }, { "epoch": 0.7648787196799199, "grad_norm": 0.8956282734870911, "learning_rate": 2e-05, "loss": 0.6512, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12235, "tokens_per_second_per_gpu": 10429.82, "total_tokens": 1208133063 }, { "epoch": 0.7649412353088272, "grad_norm": 0.9075160622596741, "learning_rate": 2e-05, "loss": 0.6472, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12236, "tokens_per_second_per_gpu": 10474.96, "total_tokens": 1208234148 }, { "epoch": 0.7650037509377344, "grad_norm": 0.8878451585769653, "learning_rate": 2e-05, "loss": 0.606, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12237, "tokens_per_second_per_gpu": 9407.84, "total_tokens": 1208329414 }, { "epoch": 0.7650662665666417, "grad_norm": 0.9105820059776306, "learning_rate": 2e-05, "loss": 0.642, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12238, "tokens_per_second_per_gpu": 10882.71, "total_tokens": 1208430091 }, { "epoch": 0.7651287821955489, "grad_norm": 0.8996140360832214, "learning_rate": 2e-05, "loss": 0.6116, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12239, "tokens_per_second_per_gpu": 10440.42, "total_tokens": 1208525843 }, { "epoch": 0.7651912978244562, "grad_norm": 0.9089987277984619, "learning_rate": 2e-05, "loss": 0.6551, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12240, "tokens_per_second_per_gpu": 10966.92, "total_tokens": 1208627691 }, { "epoch": 0.7652538134533633, "grad_norm": 0.8623258471488953, "learning_rate": 2e-05, "loss": 0.6018, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12241, "tokens_per_second_per_gpu": 10733.75, "total_tokens": 1208729566 }, { "epoch": 0.7653163290822705, "grad_norm": 0.8900090456008911, "learning_rate": 2e-05, "loss": 0.6375, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12242, "tokens_per_second_per_gpu": 10019.65, "total_tokens": 1208824403 }, { "epoch": 0.7653788447111778, "grad_norm": 0.8705351948738098, "learning_rate": 2e-05, "loss": 0.6063, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12243, "tokens_per_second_per_gpu": 9239.16, "total_tokens": 1208918424 }, { "epoch": 0.765441360340085, "grad_norm": 0.9313589334487915, "learning_rate": 2e-05, "loss": 0.65, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12244, "tokens_per_second_per_gpu": 10665.88, "total_tokens": 1209018846 }, { "epoch": 0.7655038759689923, "grad_norm": 0.8664551973342896, "learning_rate": 2e-05, "loss": 0.6052, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12245, "tokens_per_second_per_gpu": 10842.22, "total_tokens": 1209116149 }, { "epoch": 0.7655663915978995, "grad_norm": 0.896970808506012, "learning_rate": 2e-05, "loss": 0.5647, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12246, "tokens_per_second_per_gpu": 9969.65, "total_tokens": 1209210439 }, { "epoch": 0.7656289072268067, "grad_norm": 0.8814387321472168, "learning_rate": 2e-05, "loss": 0.643, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12247, "tokens_per_second_per_gpu": 11160.42, "total_tokens": 1209312973 }, { "epoch": 0.7656914228557139, "grad_norm": 0.8748134970664978, "learning_rate": 2e-05, "loss": 0.6242, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12248, "tokens_per_second_per_gpu": 9992.07, "total_tokens": 1209407565 }, { "epoch": 0.7657539384846211, "grad_norm": 0.8933886289596558, "learning_rate": 2e-05, "loss": 0.6496, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12249, "tokens_per_second_per_gpu": 10190.96, "total_tokens": 1209503036 }, { "epoch": 0.7658164541135284, "grad_norm": 0.8876616954803467, "learning_rate": 2e-05, "loss": 0.605, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12250, "tokens_per_second_per_gpu": 10113.28, "total_tokens": 1209597545 }, { "epoch": 0.7658789697424356, "grad_norm": 0.9196987748146057, "learning_rate": 2e-05, "loss": 0.6271, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12251, "tokens_per_second_per_gpu": 10373.87, "total_tokens": 1209694260 }, { "epoch": 0.7659414853713429, "grad_norm": 0.8901274800300598, "learning_rate": 2e-05, "loss": 0.6235, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12252, "tokens_per_second_per_gpu": 10451.49, "total_tokens": 1209792806 }, { "epoch": 0.76600400100025, "grad_norm": 0.8728119134902954, "learning_rate": 2e-05, "loss": 0.6525, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12253, "tokens_per_second_per_gpu": 10665.46, "total_tokens": 1209892348 }, { "epoch": 0.7660665166291573, "grad_norm": 0.8907226920127869, "learning_rate": 2e-05, "loss": 0.6101, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12254, "tokens_per_second_per_gpu": 10140.26, "total_tokens": 1209986290 }, { "epoch": 0.7661290322580645, "grad_norm": 0.8396292924880981, "learning_rate": 2e-05, "loss": 0.5694, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12255, "tokens_per_second_per_gpu": 10452.83, "total_tokens": 1210087608 }, { "epoch": 0.7661915478869717, "grad_norm": 0.8725312948226929, "learning_rate": 2e-05, "loss": 0.5909, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12256, "tokens_per_second_per_gpu": 10550.28, "total_tokens": 1210181755 }, { "epoch": 0.766254063515879, "grad_norm": 0.8861151337623596, "learning_rate": 2e-05, "loss": 0.5917, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12257, "tokens_per_second_per_gpu": 9045.8, "total_tokens": 1210276826 }, { "epoch": 0.7663165791447862, "grad_norm": 0.8719905614852905, "learning_rate": 2e-05, "loss": 0.6248, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12258, "tokens_per_second_per_gpu": 10374.53, "total_tokens": 1210380209 }, { "epoch": 0.7663790947736934, "grad_norm": 0.8866918087005615, "learning_rate": 2e-05, "loss": 0.6156, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12259, "tokens_per_second_per_gpu": 10873.4, "total_tokens": 1210481530 }, { "epoch": 0.7664416104026006, "grad_norm": 0.9053143858909607, "learning_rate": 2e-05, "loss": 0.5911, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12260, "tokens_per_second_per_gpu": 10619.8, "total_tokens": 1210578011 }, { "epoch": 0.7665041260315079, "grad_norm": 0.8705556988716125, "learning_rate": 2e-05, "loss": 0.5924, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12261, "tokens_per_second_per_gpu": 10381.91, "total_tokens": 1210676293 }, { "epoch": 0.7665666416604151, "grad_norm": 0.9094801545143127, "learning_rate": 2e-05, "loss": 0.6569, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12262, "tokens_per_second_per_gpu": 9451.99, "total_tokens": 1210774759 }, { "epoch": 0.7666291572893223, "grad_norm": 0.8992220759391785, "learning_rate": 2e-05, "loss": 0.6249, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12263, "tokens_per_second_per_gpu": 10199.27, "total_tokens": 1210875021 }, { "epoch": 0.7666916729182296, "grad_norm": 0.8919735550880432, "learning_rate": 2e-05, "loss": 0.608, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12264, "tokens_per_second_per_gpu": 10000.87, "total_tokens": 1210971597 }, { "epoch": 0.7667541885471368, "grad_norm": 0.8631247282028198, "learning_rate": 2e-05, "loss": 0.6242, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12265, "tokens_per_second_per_gpu": 10030.53, "total_tokens": 1211071666 }, { "epoch": 0.766816704176044, "grad_norm": 0.9167527556419373, "learning_rate": 2e-05, "loss": 0.649, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12266, "tokens_per_second_per_gpu": 11365.7, "total_tokens": 1211173119 }, { "epoch": 0.7668792198049512, "grad_norm": 0.903307318687439, "learning_rate": 2e-05, "loss": 0.6132, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12267, "tokens_per_second_per_gpu": 9943.35, "total_tokens": 1211265807 }, { "epoch": 0.7669417354338585, "grad_norm": 0.9043042659759521, "learning_rate": 2e-05, "loss": 0.5998, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12268, "tokens_per_second_per_gpu": 10989.36, "total_tokens": 1211365090 }, { "epoch": 0.7670042510627657, "grad_norm": 0.9014443755149841, "learning_rate": 2e-05, "loss": 0.6367, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12269, "tokens_per_second_per_gpu": 10665.65, "total_tokens": 1211463605 }, { "epoch": 0.767066766691673, "grad_norm": 0.9079703688621521, "learning_rate": 2e-05, "loss": 0.6382, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12270, "tokens_per_second_per_gpu": 10462.28, "total_tokens": 1211560942 }, { "epoch": 0.7671292823205802, "grad_norm": 0.867658793926239, "learning_rate": 2e-05, "loss": 0.6209, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12271, "tokens_per_second_per_gpu": 10657.24, "total_tokens": 1211659381 }, { "epoch": 0.7671917979494873, "grad_norm": 0.858777642250061, "learning_rate": 2e-05, "loss": 0.6259, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12272, "tokens_per_second_per_gpu": 10968.33, "total_tokens": 1211761910 }, { "epoch": 0.7672543135783946, "grad_norm": 0.8654917478561401, "learning_rate": 2e-05, "loss": 0.6392, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12273, "tokens_per_second_per_gpu": 11258.33, "total_tokens": 1211863068 }, { "epoch": 0.7673168292073018, "grad_norm": 0.8927552103996277, "learning_rate": 2e-05, "loss": 0.628, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12274, "tokens_per_second_per_gpu": 10475.18, "total_tokens": 1211959416 }, { "epoch": 0.7673793448362091, "grad_norm": 0.8942515254020691, "learning_rate": 2e-05, "loss": 0.6201, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12275, "tokens_per_second_per_gpu": 10772.3, "total_tokens": 1212052523 }, { "epoch": 0.7674418604651163, "grad_norm": 0.8974462747573853, "learning_rate": 2e-05, "loss": 0.6263, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12276, "tokens_per_second_per_gpu": 11145.23, "total_tokens": 1212152263 }, { "epoch": 0.7675043760940236, "grad_norm": 0.8771584033966064, "learning_rate": 2e-05, "loss": 0.624, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12277, "tokens_per_second_per_gpu": 10688.56, "total_tokens": 1212249432 }, { "epoch": 0.7675668917229307, "grad_norm": 0.8568053245544434, "learning_rate": 2e-05, "loss": 0.5385, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12278, "tokens_per_second_per_gpu": 10049.99, "total_tokens": 1212344149 }, { "epoch": 0.7676294073518379, "grad_norm": 0.9027978777885437, "learning_rate": 2e-05, "loss": 0.6172, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12279, "tokens_per_second_per_gpu": 10036.3, "total_tokens": 1212441872 }, { "epoch": 0.7676919229807452, "grad_norm": 0.8555992841720581, "learning_rate": 2e-05, "loss": 0.5909, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12280, "tokens_per_second_per_gpu": 11146.91, "total_tokens": 1212544846 }, { "epoch": 0.7677544386096524, "grad_norm": 0.8943524956703186, "learning_rate": 2e-05, "loss": 0.6007, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12281, "tokens_per_second_per_gpu": 10546.84, "total_tokens": 1212643493 }, { "epoch": 0.7678169542385597, "grad_norm": 0.8979059457778931, "learning_rate": 2e-05, "loss": 0.5668, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12282, "tokens_per_second_per_gpu": 9667.04, "total_tokens": 1212739509 }, { "epoch": 0.7678794698674669, "grad_norm": 0.8678910732269287, "learning_rate": 2e-05, "loss": 0.6133, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12283, "tokens_per_second_per_gpu": 10917.24, "total_tokens": 1212838326 }, { "epoch": 0.767941985496374, "grad_norm": 0.9222181439399719, "learning_rate": 2e-05, "loss": 0.5935, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12284, "tokens_per_second_per_gpu": 11416.27, "total_tokens": 1212939334 }, { "epoch": 0.7680045011252813, "grad_norm": 0.9013673067092896, "learning_rate": 2e-05, "loss": 0.5906, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12285, "tokens_per_second_per_gpu": 10304.38, "total_tokens": 1213032505 }, { "epoch": 0.7680670167541885, "grad_norm": 0.8811435699462891, "learning_rate": 2e-05, "loss": 0.6274, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12286, "tokens_per_second_per_gpu": 10886.18, "total_tokens": 1213135203 }, { "epoch": 0.7681295323830958, "grad_norm": 0.9177872538566589, "learning_rate": 2e-05, "loss": 0.621, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12287, "tokens_per_second_per_gpu": 10642.85, "total_tokens": 1213232636 }, { "epoch": 0.768192048012003, "grad_norm": 0.8770222067832947, "learning_rate": 2e-05, "loss": 0.611, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12288, "tokens_per_second_per_gpu": 10061.21, "total_tokens": 1213328800 }, { "epoch": 0.7682545636409103, "grad_norm": 0.9602672457695007, "learning_rate": 2e-05, "loss": 0.6626, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12289, "tokens_per_second_per_gpu": 9806.86, "total_tokens": 1213423675 }, { "epoch": 0.7683170792698174, "grad_norm": 0.8943228721618652, "learning_rate": 2e-05, "loss": 0.656, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12290, "tokens_per_second_per_gpu": 9892.94, "total_tokens": 1213521040 }, { "epoch": 0.7683795948987246, "grad_norm": 0.9277011752128601, "learning_rate": 2e-05, "loss": 0.5997, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12291, "tokens_per_second_per_gpu": 9996.34, "total_tokens": 1213615384 }, { "epoch": 0.7684421105276319, "grad_norm": 0.8540713787078857, "learning_rate": 2e-05, "loss": 0.625, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12292, "tokens_per_second_per_gpu": 11262.51, "total_tokens": 1213718117 }, { "epoch": 0.7685046261565391, "grad_norm": 0.8877201080322266, "learning_rate": 2e-05, "loss": 0.5857, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12293, "tokens_per_second_per_gpu": 9812.29, "total_tokens": 1213814132 }, { "epoch": 0.7685671417854464, "grad_norm": 0.9392566084861755, "learning_rate": 2e-05, "loss": 0.6199, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12294, "tokens_per_second_per_gpu": 9884.45, "total_tokens": 1213910218 }, { "epoch": 0.7686296574143536, "grad_norm": 0.8905316591262817, "learning_rate": 2e-05, "loss": 0.6519, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12295, "tokens_per_second_per_gpu": 10603.31, "total_tokens": 1214010400 }, { "epoch": 0.7686921730432608, "grad_norm": 0.887798011302948, "learning_rate": 2e-05, "loss": 0.6129, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12296, "tokens_per_second_per_gpu": 10212.89, "total_tokens": 1214106748 }, { "epoch": 0.768754688672168, "grad_norm": 0.8971613049507141, "learning_rate": 2e-05, "loss": 0.6154, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12297, "tokens_per_second_per_gpu": 10242.18, "total_tokens": 1214207629 }, { "epoch": 0.7688172043010753, "grad_norm": 0.8952729105949402, "learning_rate": 2e-05, "loss": 0.6327, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12298, "tokens_per_second_per_gpu": 10091.15, "total_tokens": 1214305186 }, { "epoch": 0.7688797199299825, "grad_norm": 0.8718292117118835, "learning_rate": 2e-05, "loss": 0.6217, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12299, "tokens_per_second_per_gpu": 10651.01, "total_tokens": 1214407516 }, { "epoch": 0.7689422355588897, "grad_norm": 0.8834494948387146, "learning_rate": 2e-05, "loss": 0.6151, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12300, "tokens_per_second_per_gpu": 10161.58, "total_tokens": 1214504159 }, { "epoch": 0.769004751187797, "grad_norm": 0.865858793258667, "learning_rate": 2e-05, "loss": 0.6519, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12301, "tokens_per_second_per_gpu": 11246.7, "total_tokens": 1214606854 }, { "epoch": 0.7690672668167042, "grad_norm": 1.0374757051467896, "learning_rate": 2e-05, "loss": 0.6352, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12302, "tokens_per_second_per_gpu": 10308.12, "total_tokens": 1214704822 }, { "epoch": 0.7691297824456114, "grad_norm": 0.871810257434845, "learning_rate": 2e-05, "loss": 0.6382, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12303, "tokens_per_second_per_gpu": 9962.76, "total_tokens": 1214804342 }, { "epoch": 0.7691922980745186, "grad_norm": 0.9279683828353882, "learning_rate": 2e-05, "loss": 0.6417, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12304, "tokens_per_second_per_gpu": 10203.51, "total_tokens": 1214899426 }, { "epoch": 0.7692548137034259, "grad_norm": 0.9112594127655029, "learning_rate": 2e-05, "loss": 0.6687, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12305, "tokens_per_second_per_gpu": 10528.97, "total_tokens": 1215002110 }, { "epoch": 0.7693173293323331, "grad_norm": 0.932650625705719, "learning_rate": 2e-05, "loss": 0.5856, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12306, "tokens_per_second_per_gpu": 9094.84, "total_tokens": 1215092071 }, { "epoch": 0.7693798449612403, "grad_norm": 0.9579400420188904, "learning_rate": 2e-05, "loss": 0.6416, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12307, "tokens_per_second_per_gpu": 10192.54, "total_tokens": 1215193429 }, { "epoch": 0.7694423605901476, "grad_norm": 0.9188246726989746, "learning_rate": 2e-05, "loss": 0.6069, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12308, "tokens_per_second_per_gpu": 9331.03, "total_tokens": 1215284625 }, { "epoch": 0.7695048762190547, "grad_norm": 0.9199602603912354, "learning_rate": 2e-05, "loss": 0.6335, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12309, "tokens_per_second_per_gpu": 11071.03, "total_tokens": 1215386645 }, { "epoch": 0.769567391847962, "grad_norm": 0.8695295453071594, "learning_rate": 2e-05, "loss": 0.5876, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12310, "tokens_per_second_per_gpu": 11007.35, "total_tokens": 1215486113 }, { "epoch": 0.7696299074768692, "grad_norm": 0.8639242053031921, "learning_rate": 2e-05, "loss": 0.6179, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12311, "tokens_per_second_per_gpu": 10121.52, "total_tokens": 1215583567 }, { "epoch": 0.7696924231057765, "grad_norm": 0.8644299507141113, "learning_rate": 2e-05, "loss": 0.6128, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12312, "tokens_per_second_per_gpu": 9364.01, "total_tokens": 1215680005 }, { "epoch": 0.7697549387346837, "grad_norm": 0.8726176619529724, "learning_rate": 2e-05, "loss": 0.6173, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12313, "tokens_per_second_per_gpu": 10573.65, "total_tokens": 1215778310 }, { "epoch": 0.769817454363591, "grad_norm": 0.885691225528717, "learning_rate": 2e-05, "loss": 0.5968, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12314, "tokens_per_second_per_gpu": 11025.14, "total_tokens": 1215878002 }, { "epoch": 0.7698799699924981, "grad_norm": 0.8609756231307983, "learning_rate": 2e-05, "loss": 0.5913, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12315, "tokens_per_second_per_gpu": 10605.12, "total_tokens": 1215976210 }, { "epoch": 0.7699424856214053, "grad_norm": 0.8903659582138062, "learning_rate": 2e-05, "loss": 0.6004, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12316, "tokens_per_second_per_gpu": 10623.76, "total_tokens": 1216070924 }, { "epoch": 0.7700050012503126, "grad_norm": 0.9719569087028503, "learning_rate": 2e-05, "loss": 0.6491, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12317, "tokens_per_second_per_gpu": 10656.13, "total_tokens": 1216167671 }, { "epoch": 0.7700675168792198, "grad_norm": 0.9169960618019104, "learning_rate": 2e-05, "loss": 0.6114, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12318, "tokens_per_second_per_gpu": 9981.12, "total_tokens": 1216259013 }, { "epoch": 0.7701300325081271, "grad_norm": 0.880888819694519, "learning_rate": 2e-05, "loss": 0.6167, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12319, "tokens_per_second_per_gpu": 11017.0, "total_tokens": 1216359500 }, { "epoch": 0.7701925481370343, "grad_norm": 0.9323647022247314, "learning_rate": 2e-05, "loss": 0.594, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12320, "tokens_per_second_per_gpu": 9581.45, "total_tokens": 1216454876 }, { "epoch": 0.7702550637659414, "grad_norm": 0.8932697176933289, "learning_rate": 2e-05, "loss": 0.588, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12321, "tokens_per_second_per_gpu": 10054.09, "total_tokens": 1216550883 }, { "epoch": 0.7703175793948487, "grad_norm": 0.9099159836769104, "learning_rate": 2e-05, "loss": 0.6307, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12322, "tokens_per_second_per_gpu": 10042.66, "total_tokens": 1216644443 }, { "epoch": 0.7703800950237559, "grad_norm": 0.9542551636695862, "learning_rate": 2e-05, "loss": 0.6206, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12323, "tokens_per_second_per_gpu": 10427.33, "total_tokens": 1216741579 }, { "epoch": 0.7704426106526632, "grad_norm": 0.9614187479019165, "learning_rate": 2e-05, "loss": 0.623, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12324, "tokens_per_second_per_gpu": 9871.76, "total_tokens": 1216836911 }, { "epoch": 0.7705051262815704, "grad_norm": 0.8914777040481567, "learning_rate": 2e-05, "loss": 0.6621, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12325, "tokens_per_second_per_gpu": 11093.7, "total_tokens": 1216937544 }, { "epoch": 0.7705676419104777, "grad_norm": 0.8927424550056458, "learning_rate": 2e-05, "loss": 0.6093, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12326, "tokens_per_second_per_gpu": 11395.17, "total_tokens": 1217040315 }, { "epoch": 0.7706301575393848, "grad_norm": 0.8961119651794434, "learning_rate": 2e-05, "loss": 0.6563, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12327, "tokens_per_second_per_gpu": 10343.82, "total_tokens": 1217140549 }, { "epoch": 0.770692673168292, "grad_norm": 0.8380751609802246, "learning_rate": 2e-05, "loss": 0.566, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12328, "tokens_per_second_per_gpu": 10516.95, "total_tokens": 1217237210 }, { "epoch": 0.7707551887971993, "grad_norm": 0.9035447239875793, "learning_rate": 2e-05, "loss": 0.6186, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12329, "tokens_per_second_per_gpu": 10014.23, "total_tokens": 1217333495 }, { "epoch": 0.7708177044261065, "grad_norm": 0.9112451076507568, "learning_rate": 2e-05, "loss": 0.6697, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12330, "tokens_per_second_per_gpu": 10837.6, "total_tokens": 1217434591 }, { "epoch": 0.7708802200550138, "grad_norm": 0.9069415926933289, "learning_rate": 2e-05, "loss": 0.6252, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12331, "tokens_per_second_per_gpu": 10610.87, "total_tokens": 1217529906 }, { "epoch": 0.770942735683921, "grad_norm": 0.9095465540885925, "learning_rate": 2e-05, "loss": 0.631, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12332, "tokens_per_second_per_gpu": 10178.42, "total_tokens": 1217628500 }, { "epoch": 0.7710052513128282, "grad_norm": 0.9593501091003418, "learning_rate": 2e-05, "loss": 0.6197, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12333, "tokens_per_second_per_gpu": 10003.66, "total_tokens": 1217721397 }, { "epoch": 0.7710677669417354, "grad_norm": 0.8941770792007446, "learning_rate": 2e-05, "loss": 0.5783, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12334, "tokens_per_second_per_gpu": 10321.44, "total_tokens": 1217818262 }, { "epoch": 0.7711302825706426, "grad_norm": 0.9165018796920776, "learning_rate": 2e-05, "loss": 0.636, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12335, "tokens_per_second_per_gpu": 10016.95, "total_tokens": 1217913638 }, { "epoch": 0.7711927981995499, "grad_norm": 0.9139611124992371, "learning_rate": 2e-05, "loss": 0.6499, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12336, "tokens_per_second_per_gpu": 10305.32, "total_tokens": 1218013360 }, { "epoch": 0.7712553138284571, "grad_norm": 0.8851239085197449, "learning_rate": 2e-05, "loss": 0.6029, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12337, "tokens_per_second_per_gpu": 10828.08, "total_tokens": 1218112301 }, { "epoch": 0.7713178294573644, "grad_norm": 0.93797367811203, "learning_rate": 2e-05, "loss": 0.6242, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12338, "tokens_per_second_per_gpu": 9918.75, "total_tokens": 1218212008 }, { "epoch": 0.7713803450862715, "grad_norm": 0.898669421672821, "learning_rate": 2e-05, "loss": 0.6252, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12339, "tokens_per_second_per_gpu": 10478.83, "total_tokens": 1218307577 }, { "epoch": 0.7714428607151788, "grad_norm": 0.8554926514625549, "learning_rate": 2e-05, "loss": 0.6353, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12340, "tokens_per_second_per_gpu": 10721.27, "total_tokens": 1218408240 }, { "epoch": 0.771505376344086, "grad_norm": 0.8820350170135498, "learning_rate": 2e-05, "loss": 0.6226, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12341, "tokens_per_second_per_gpu": 10996.31, "total_tokens": 1218508182 }, { "epoch": 0.7715678919729932, "grad_norm": 0.8880570530891418, "learning_rate": 2e-05, "loss": 0.6186, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12342, "tokens_per_second_per_gpu": 10880.84, "total_tokens": 1218607984 }, { "epoch": 0.7716304076019005, "grad_norm": 0.9120246171951294, "learning_rate": 2e-05, "loss": 0.6278, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12343, "tokens_per_second_per_gpu": 10190.52, "total_tokens": 1218703745 }, { "epoch": 0.7716929232308077, "grad_norm": 0.9172576069831848, "learning_rate": 2e-05, "loss": 0.6153, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12344, "tokens_per_second_per_gpu": 10395.33, "total_tokens": 1218802660 }, { "epoch": 0.771755438859715, "grad_norm": 0.8913399577140808, "learning_rate": 2e-05, "loss": 0.6413, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12345, "tokens_per_second_per_gpu": 10283.24, "total_tokens": 1218900339 }, { "epoch": 0.7718179544886221, "grad_norm": 0.8337544798851013, "learning_rate": 2e-05, "loss": 0.5942, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12346, "tokens_per_second_per_gpu": 10836.62, "total_tokens": 1219001812 }, { "epoch": 0.7718804701175294, "grad_norm": 0.8702872395515442, "learning_rate": 2e-05, "loss": 0.6363, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12347, "tokens_per_second_per_gpu": 10385.29, "total_tokens": 1219100129 }, { "epoch": 0.7719429857464366, "grad_norm": 0.9090170860290527, "learning_rate": 2e-05, "loss": 0.5886, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12348, "tokens_per_second_per_gpu": 10115.78, "total_tokens": 1219197274 }, { "epoch": 0.7720055013753438, "grad_norm": 0.8954448699951172, "learning_rate": 2e-05, "loss": 0.591, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12349, "tokens_per_second_per_gpu": 11053.32, "total_tokens": 1219293189 }, { "epoch": 0.7720680170042511, "grad_norm": 0.862928032875061, "learning_rate": 2e-05, "loss": 0.602, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12350, "tokens_per_second_per_gpu": 10850.78, "total_tokens": 1219388571 }, { "epoch": 0.7721305326331583, "grad_norm": 0.8733709454536438, "learning_rate": 2e-05, "loss": 0.637, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12351, "tokens_per_second_per_gpu": 10640.76, "total_tokens": 1219486915 }, { "epoch": 0.7721930482620655, "grad_norm": 0.9237092137336731, "learning_rate": 2e-05, "loss": 0.6589, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12352, "tokens_per_second_per_gpu": 9999.48, "total_tokens": 1219584316 }, { "epoch": 0.7722555638909727, "grad_norm": 0.9369692206382751, "learning_rate": 2e-05, "loss": 0.6649, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12353, "tokens_per_second_per_gpu": 10297.95, "total_tokens": 1219680952 }, { "epoch": 0.77231807951988, "grad_norm": 0.9200562834739685, "learning_rate": 2e-05, "loss": 0.5962, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12354, "tokens_per_second_per_gpu": 10525.65, "total_tokens": 1219776881 }, { "epoch": 0.7723805951487872, "grad_norm": 0.8754668235778809, "learning_rate": 2e-05, "loss": 0.6166, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12355, "tokens_per_second_per_gpu": 10612.93, "total_tokens": 1219878093 }, { "epoch": 0.7724431107776945, "grad_norm": 0.894361138343811, "learning_rate": 2e-05, "loss": 0.6271, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12356, "tokens_per_second_per_gpu": 10470.38, "total_tokens": 1219976724 }, { "epoch": 0.7725056264066017, "grad_norm": 0.906287670135498, "learning_rate": 2e-05, "loss": 0.5976, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12357, "tokens_per_second_per_gpu": 9727.55, "total_tokens": 1220071761 }, { "epoch": 0.7725681420355088, "grad_norm": 0.9075653553009033, "learning_rate": 2e-05, "loss": 0.5995, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12358, "tokens_per_second_per_gpu": 11325.47, "total_tokens": 1220170628 }, { "epoch": 0.7726306576644161, "grad_norm": 0.9090620279312134, "learning_rate": 2e-05, "loss": 0.694, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12359, "tokens_per_second_per_gpu": 10580.27, "total_tokens": 1220271343 }, { "epoch": 0.7726931732933233, "grad_norm": 0.9523033499717712, "learning_rate": 2e-05, "loss": 0.6109, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12360, "tokens_per_second_per_gpu": 10307.89, "total_tokens": 1220366254 }, { "epoch": 0.7727556889222306, "grad_norm": 0.9109486937522888, "learning_rate": 2e-05, "loss": 0.6474, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12361, "tokens_per_second_per_gpu": 13970.68, "total_tokens": 1220464424 }, { "epoch": 0.7728182045511378, "grad_norm": 0.8985635042190552, "learning_rate": 2e-05, "loss": 0.6325, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12362, "tokens_per_second_per_gpu": 10650.68, "total_tokens": 1220558965 }, { "epoch": 0.772880720180045, "grad_norm": 0.8865692019462585, "learning_rate": 2e-05, "loss": 0.6385, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12363, "tokens_per_second_per_gpu": 10670.84, "total_tokens": 1220658066 }, { "epoch": 0.7729432358089522, "grad_norm": 0.9276281595230103, "learning_rate": 2e-05, "loss": 0.5978, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12364, "tokens_per_second_per_gpu": 9904.6, "total_tokens": 1220753798 }, { "epoch": 0.7730057514378594, "grad_norm": 0.9802423119544983, "learning_rate": 2e-05, "loss": 0.6093, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12365, "tokens_per_second_per_gpu": 9971.26, "total_tokens": 1220848645 }, { "epoch": 0.7730682670667667, "grad_norm": 0.9126548767089844, "learning_rate": 2e-05, "loss": 0.6211, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12366, "tokens_per_second_per_gpu": 9545.14, "total_tokens": 1220945535 }, { "epoch": 0.7731307826956739, "grad_norm": 0.864551842212677, "learning_rate": 2e-05, "loss": 0.6268, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12367, "tokens_per_second_per_gpu": 10868.45, "total_tokens": 1221046394 }, { "epoch": 0.7731932983245812, "grad_norm": 0.8990899324417114, "learning_rate": 2e-05, "loss": 0.6134, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12368, "tokens_per_second_per_gpu": 9864.03, "total_tokens": 1221144265 }, { "epoch": 0.7732558139534884, "grad_norm": 0.8975696563720703, "learning_rate": 2e-05, "loss": 0.635, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12369, "tokens_per_second_per_gpu": 10287.47, "total_tokens": 1221241690 }, { "epoch": 0.7733183295823955, "grad_norm": 0.8809595704078674, "learning_rate": 2e-05, "loss": 0.6624, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12370, "tokens_per_second_per_gpu": 10999.45, "total_tokens": 1221342173 }, { "epoch": 0.7733808452113028, "grad_norm": 0.8932453393936157, "learning_rate": 2e-05, "loss": 0.5942, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12371, "tokens_per_second_per_gpu": 10646.66, "total_tokens": 1221438244 }, { "epoch": 0.77344336084021, "grad_norm": 0.9015908241271973, "learning_rate": 2e-05, "loss": 0.6407, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12372, "tokens_per_second_per_gpu": 10737.65, "total_tokens": 1221535411 }, { "epoch": 0.7735058764691173, "grad_norm": 0.8637934923171997, "learning_rate": 2e-05, "loss": 0.6147, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12373, "tokens_per_second_per_gpu": 10559.3, "total_tokens": 1221635119 }, { "epoch": 0.7735683920980245, "grad_norm": 0.9171342849731445, "learning_rate": 2e-05, "loss": 0.6463, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12374, "tokens_per_second_per_gpu": 10652.15, "total_tokens": 1221738408 }, { "epoch": 0.7736309077269318, "grad_norm": 0.9151005148887634, "learning_rate": 2e-05, "loss": 0.6129, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12375, "tokens_per_second_per_gpu": 10891.46, "total_tokens": 1221840025 }, { "epoch": 0.7736934233558389, "grad_norm": 0.9115880727767944, "learning_rate": 2e-05, "loss": 0.6226, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12376, "tokens_per_second_per_gpu": 10299.38, "total_tokens": 1221938105 }, { "epoch": 0.7737559389847461, "grad_norm": 0.8635873794555664, "learning_rate": 2e-05, "loss": 0.6178, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12377, "tokens_per_second_per_gpu": 11450.97, "total_tokens": 1222036866 }, { "epoch": 0.7738184546136534, "grad_norm": 0.874393880367279, "learning_rate": 2e-05, "loss": 0.6135, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12378, "tokens_per_second_per_gpu": 10476.6, "total_tokens": 1222134526 }, { "epoch": 0.7738809702425606, "grad_norm": 0.9414752125740051, "learning_rate": 2e-05, "loss": 0.6367, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12379, "tokens_per_second_per_gpu": 8951.3, "total_tokens": 1222225628 }, { "epoch": 0.7739434858714679, "grad_norm": 0.8976293206214905, "learning_rate": 2e-05, "loss": 0.6321, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12380, "tokens_per_second_per_gpu": 11230.67, "total_tokens": 1222326622 }, { "epoch": 0.7740060015003751, "grad_norm": 0.8858530521392822, "learning_rate": 2e-05, "loss": 0.6111, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12381, "tokens_per_second_per_gpu": 11125.88, "total_tokens": 1222426157 }, { "epoch": 0.7740685171292824, "grad_norm": 0.8720037937164307, "learning_rate": 2e-05, "loss": 0.5887, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12382, "tokens_per_second_per_gpu": 10915.33, "total_tokens": 1222526133 }, { "epoch": 0.7741310327581895, "grad_norm": 0.9381666779518127, "learning_rate": 2e-05, "loss": 0.6082, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12383, "tokens_per_second_per_gpu": 10215.46, "total_tokens": 1222621428 }, { "epoch": 0.7741935483870968, "grad_norm": 0.9006643891334534, "learning_rate": 2e-05, "loss": 0.6073, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12384, "tokens_per_second_per_gpu": 10386.94, "total_tokens": 1222721094 }, { "epoch": 0.774256064016004, "grad_norm": 0.944378674030304, "learning_rate": 2e-05, "loss": 0.6216, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12385, "tokens_per_second_per_gpu": 10002.26, "total_tokens": 1222818727 }, { "epoch": 0.7743185796449112, "grad_norm": 0.9047593474388123, "learning_rate": 2e-05, "loss": 0.6199, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12386, "tokens_per_second_per_gpu": 10345.65, "total_tokens": 1222916633 }, { "epoch": 0.7743810952738185, "grad_norm": 0.8777630925178528, "learning_rate": 2e-05, "loss": 0.6172, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12387, "tokens_per_second_per_gpu": 10451.97, "total_tokens": 1223013393 }, { "epoch": 0.7744436109027257, "grad_norm": 0.8818095922470093, "learning_rate": 2e-05, "loss": 0.5893, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12388, "tokens_per_second_per_gpu": 10457.93, "total_tokens": 1223110974 }, { "epoch": 0.7745061265316329, "grad_norm": 0.9009571671485901, "learning_rate": 2e-05, "loss": 0.6133, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12389, "tokens_per_second_per_gpu": 10625.33, "total_tokens": 1223209522 }, { "epoch": 0.7745686421605401, "grad_norm": 0.9182834625244141, "learning_rate": 2e-05, "loss": 0.616, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12390, "tokens_per_second_per_gpu": 10263.25, "total_tokens": 1223304747 }, { "epoch": 0.7746311577894474, "grad_norm": 0.9226110577583313, "learning_rate": 2e-05, "loss": 0.6406, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12391, "tokens_per_second_per_gpu": 10055.96, "total_tokens": 1223399660 }, { "epoch": 0.7746936734183546, "grad_norm": 0.8726171255111694, "learning_rate": 2e-05, "loss": 0.6143, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12392, "tokens_per_second_per_gpu": 9884.99, "total_tokens": 1223496280 }, { "epoch": 0.7747561890472618, "grad_norm": 0.8712562918663025, "learning_rate": 2e-05, "loss": 0.6164, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12393, "tokens_per_second_per_gpu": 10087.62, "total_tokens": 1223593819 }, { "epoch": 0.7748187046761691, "grad_norm": 0.9863963723182678, "learning_rate": 2e-05, "loss": 0.6417, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12394, "tokens_per_second_per_gpu": 10996.33, "total_tokens": 1223694474 }, { "epoch": 0.7748812203050762, "grad_norm": 0.9090054035186768, "learning_rate": 2e-05, "loss": 0.6312, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12395, "tokens_per_second_per_gpu": 10530.49, "total_tokens": 1223791710 }, { "epoch": 0.7749437359339835, "grad_norm": 0.9216911792755127, "learning_rate": 2e-05, "loss": 0.6295, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12396, "tokens_per_second_per_gpu": 10410.33, "total_tokens": 1223888734 }, { "epoch": 0.7750062515628907, "grad_norm": 0.9289339184761047, "learning_rate": 2e-05, "loss": 0.6437, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12397, "tokens_per_second_per_gpu": 10836.34, "total_tokens": 1223987466 }, { "epoch": 0.775068767191798, "grad_norm": 0.8949926495552063, "learning_rate": 2e-05, "loss": 0.6155, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12398, "tokens_per_second_per_gpu": 10345.33, "total_tokens": 1224084497 }, { "epoch": 0.7751312828207052, "grad_norm": 0.8555625081062317, "learning_rate": 2e-05, "loss": 0.6172, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12399, "tokens_per_second_per_gpu": 11537.54, "total_tokens": 1224189203 }, { "epoch": 0.7751937984496124, "grad_norm": 0.8992127776145935, "learning_rate": 2e-05, "loss": 0.6239, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12400, "tokens_per_second_per_gpu": 10646.76, "total_tokens": 1224288745 }, { "epoch": 0.7752563140785196, "grad_norm": 0.8998556733131409, "learning_rate": 2e-05, "loss": 0.6251, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12401, "tokens_per_second_per_gpu": 10555.55, "total_tokens": 1224386613 }, { "epoch": 0.7753188297074268, "grad_norm": 0.9121404886245728, "learning_rate": 2e-05, "loss": 0.6357, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12402, "tokens_per_second_per_gpu": 10115.62, "total_tokens": 1224486984 }, { "epoch": 0.7753813453363341, "grad_norm": 0.8580340147018433, "learning_rate": 2e-05, "loss": 0.6538, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12403, "tokens_per_second_per_gpu": 10697.24, "total_tokens": 1224588773 }, { "epoch": 0.7754438609652413, "grad_norm": 0.8746473789215088, "learning_rate": 2e-05, "loss": 0.6403, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12404, "tokens_per_second_per_gpu": 9912.98, "total_tokens": 1224684120 }, { "epoch": 0.7755063765941486, "grad_norm": 0.9241585731506348, "learning_rate": 2e-05, "loss": 0.6052, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12405, "tokens_per_second_per_gpu": 10230.24, "total_tokens": 1224779153 }, { "epoch": 0.7755688922230558, "grad_norm": 0.9457208514213562, "learning_rate": 2e-05, "loss": 0.6033, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12406, "tokens_per_second_per_gpu": 9523.12, "total_tokens": 1224870702 }, { "epoch": 0.7756314078519629, "grad_norm": 0.9244033098220825, "learning_rate": 2e-05, "loss": 0.6465, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12407, "tokens_per_second_per_gpu": 10618.89, "total_tokens": 1224966670 }, { "epoch": 0.7756939234808702, "grad_norm": 0.9283875823020935, "learning_rate": 2e-05, "loss": 0.6384, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12408, "tokens_per_second_per_gpu": 10167.66, "total_tokens": 1225062828 }, { "epoch": 0.7757564391097774, "grad_norm": 0.8898163437843323, "learning_rate": 2e-05, "loss": 0.5851, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12409, "tokens_per_second_per_gpu": 10047.8, "total_tokens": 1225157668 }, { "epoch": 0.7758189547386847, "grad_norm": 0.9256742000579834, "learning_rate": 2e-05, "loss": 0.6429, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12410, "tokens_per_second_per_gpu": 10762.09, "total_tokens": 1225250473 }, { "epoch": 0.7758814703675919, "grad_norm": 0.8960968852043152, "learning_rate": 2e-05, "loss": 0.5862, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12411, "tokens_per_second_per_gpu": 10003.88, "total_tokens": 1225346127 }, { "epoch": 0.7759439859964992, "grad_norm": 0.8740201592445374, "learning_rate": 2e-05, "loss": 0.6206, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12412, "tokens_per_second_per_gpu": 10729.81, "total_tokens": 1225448030 }, { "epoch": 0.7760065016254063, "grad_norm": 0.8651612401008606, "learning_rate": 2e-05, "loss": 0.6699, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12413, "tokens_per_second_per_gpu": 10780.92, "total_tokens": 1225550043 }, { "epoch": 0.7760690172543135, "grad_norm": 0.8853360414505005, "learning_rate": 2e-05, "loss": 0.6272, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12414, "tokens_per_second_per_gpu": 10605.56, "total_tokens": 1225647391 }, { "epoch": 0.7761315328832208, "grad_norm": 0.880530834197998, "learning_rate": 2e-05, "loss": 0.5835, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12415, "tokens_per_second_per_gpu": 10722.52, "total_tokens": 1225744362 }, { "epoch": 0.776194048512128, "grad_norm": 0.8981295228004456, "learning_rate": 2e-05, "loss": 0.5964, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12416, "tokens_per_second_per_gpu": 10208.45, "total_tokens": 1225840103 }, { "epoch": 0.7762565641410353, "grad_norm": 0.8959431648254395, "learning_rate": 2e-05, "loss": 0.642, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12417, "tokens_per_second_per_gpu": 10722.83, "total_tokens": 1225939479 }, { "epoch": 0.7763190797699425, "grad_norm": 0.8551191091537476, "learning_rate": 2e-05, "loss": 0.5926, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12418, "tokens_per_second_per_gpu": 10556.07, "total_tokens": 1226040242 }, { "epoch": 0.7763815953988498, "grad_norm": 0.8590339422225952, "learning_rate": 2e-05, "loss": 0.6212, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12419, "tokens_per_second_per_gpu": 10208.49, "total_tokens": 1226140276 }, { "epoch": 0.7764441110277569, "grad_norm": 0.8860058188438416, "learning_rate": 2e-05, "loss": 0.6691, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12420, "tokens_per_second_per_gpu": 10752.42, "total_tokens": 1226241401 }, { "epoch": 0.7765066266566641, "grad_norm": 0.8821902871131897, "learning_rate": 2e-05, "loss": 0.5698, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12421, "tokens_per_second_per_gpu": 9470.62, "total_tokens": 1226335233 }, { "epoch": 0.7765691422855714, "grad_norm": 0.905474066734314, "learning_rate": 2e-05, "loss": 0.614, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12422, "tokens_per_second_per_gpu": 9782.37, "total_tokens": 1226427801 }, { "epoch": 0.7766316579144786, "grad_norm": 0.9006933569908142, "learning_rate": 2e-05, "loss": 0.5818, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12423, "tokens_per_second_per_gpu": 9701.56, "total_tokens": 1226520548 }, { "epoch": 0.7766941735433859, "grad_norm": 0.8854614496231079, "learning_rate": 2e-05, "loss": 0.5822, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12424, "tokens_per_second_per_gpu": 10102.26, "total_tokens": 1226617873 }, { "epoch": 0.7767566891722931, "grad_norm": 0.9628580808639526, "learning_rate": 2e-05, "loss": 0.6549, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12425, "tokens_per_second_per_gpu": 10542.86, "total_tokens": 1226717935 }, { "epoch": 0.7768192048012003, "grad_norm": 0.8715569376945496, "learning_rate": 2e-05, "loss": 0.5896, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12426, "tokens_per_second_per_gpu": 10775.38, "total_tokens": 1226817044 }, { "epoch": 0.7768817204301075, "grad_norm": 0.8854271173477173, "learning_rate": 2e-05, "loss": 0.6092, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12427, "tokens_per_second_per_gpu": 10072.88, "total_tokens": 1226912704 }, { "epoch": 0.7769442360590147, "grad_norm": 0.9062399864196777, "learning_rate": 2e-05, "loss": 0.6469, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12428, "tokens_per_second_per_gpu": 10627.65, "total_tokens": 1227010171 }, { "epoch": 0.777006751687922, "grad_norm": 0.9798275828361511, "learning_rate": 2e-05, "loss": 0.6144, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12429, "tokens_per_second_per_gpu": 9644.32, "total_tokens": 1227102088 }, { "epoch": 0.7770692673168292, "grad_norm": 0.970504879951477, "learning_rate": 2e-05, "loss": 0.6222, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12430, "tokens_per_second_per_gpu": 9804.31, "total_tokens": 1227193465 }, { "epoch": 0.7771317829457365, "grad_norm": 0.8884260654449463, "learning_rate": 2e-05, "loss": 0.6445, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12431, "tokens_per_second_per_gpu": 10774.09, "total_tokens": 1227297038 }, { "epoch": 0.7771942985746436, "grad_norm": 0.9091261625289917, "learning_rate": 2e-05, "loss": 0.6341, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12432, "tokens_per_second_per_gpu": 10788.37, "total_tokens": 1227393380 }, { "epoch": 0.7772568142035509, "grad_norm": 0.9034382104873657, "learning_rate": 2e-05, "loss": 0.6312, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12433, "tokens_per_second_per_gpu": 10509.97, "total_tokens": 1227491531 }, { "epoch": 0.7773193298324581, "grad_norm": 0.9291245341300964, "learning_rate": 2e-05, "loss": 0.6499, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12434, "tokens_per_second_per_gpu": 10062.81, "total_tokens": 1227589473 }, { "epoch": 0.7773818454613654, "grad_norm": 0.8584287762641907, "learning_rate": 2e-05, "loss": 0.6443, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12435, "tokens_per_second_per_gpu": 11083.15, "total_tokens": 1227688882 }, { "epoch": 0.7774443610902726, "grad_norm": 0.8691886067390442, "learning_rate": 2e-05, "loss": 0.6043, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12436, "tokens_per_second_per_gpu": 10265.39, "total_tokens": 1227785109 }, { "epoch": 0.7775068767191798, "grad_norm": 0.8806112408638, "learning_rate": 2e-05, "loss": 0.6023, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12437, "tokens_per_second_per_gpu": 9853.55, "total_tokens": 1227883378 }, { "epoch": 0.777569392348087, "grad_norm": 0.8892425298690796, "learning_rate": 2e-05, "loss": 0.5943, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12438, "tokens_per_second_per_gpu": 10327.59, "total_tokens": 1227977878 }, { "epoch": 0.7776319079769942, "grad_norm": 0.9213895201683044, "learning_rate": 2e-05, "loss": 0.6078, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12439, "tokens_per_second_per_gpu": 10169.55, "total_tokens": 1228074005 }, { "epoch": 0.7776944236059015, "grad_norm": 0.9337389469146729, "learning_rate": 2e-05, "loss": 0.6377, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12440, "tokens_per_second_per_gpu": 10837.55, "total_tokens": 1228170790 }, { "epoch": 0.7777569392348087, "grad_norm": 0.9601514339447021, "learning_rate": 2e-05, "loss": 0.6624, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12441, "tokens_per_second_per_gpu": 10917.04, "total_tokens": 1228267248 }, { "epoch": 0.777819454863716, "grad_norm": 0.909778892993927, "learning_rate": 2e-05, "loss": 0.6424, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12442, "tokens_per_second_per_gpu": 10257.84, "total_tokens": 1228362564 }, { "epoch": 0.7778819704926232, "grad_norm": 0.9070284366607666, "learning_rate": 2e-05, "loss": 0.6457, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12443, "tokens_per_second_per_gpu": 9412.76, "total_tokens": 1228453431 }, { "epoch": 0.7779444861215303, "grad_norm": 0.8758500814437866, "learning_rate": 2e-05, "loss": 0.5991, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12444, "tokens_per_second_per_gpu": 9761.25, "total_tokens": 1228549348 }, { "epoch": 0.7780070017504376, "grad_norm": 0.9090096354484558, "learning_rate": 2e-05, "loss": 0.5689, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12445, "tokens_per_second_per_gpu": 9537.38, "total_tokens": 1228643623 }, { "epoch": 0.7780695173793448, "grad_norm": 0.8936033844947815, "learning_rate": 2e-05, "loss": 0.6215, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12446, "tokens_per_second_per_gpu": 9645.96, "total_tokens": 1228734800 }, { "epoch": 0.7781320330082521, "grad_norm": 0.8720347881317139, "learning_rate": 2e-05, "loss": 0.5796, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12447, "tokens_per_second_per_gpu": 10208.1, "total_tokens": 1228830929 }, { "epoch": 0.7781945486371593, "grad_norm": 0.8932313919067383, "learning_rate": 2e-05, "loss": 0.5965, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12448, "tokens_per_second_per_gpu": 10518.87, "total_tokens": 1228927921 }, { "epoch": 0.7782570642660666, "grad_norm": 0.8978902101516724, "learning_rate": 2e-05, "loss": 0.6435, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12449, "tokens_per_second_per_gpu": 10199.38, "total_tokens": 1229026623 }, { "epoch": 0.7783195798949737, "grad_norm": 0.8548625707626343, "learning_rate": 2e-05, "loss": 0.6158, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12450, "tokens_per_second_per_gpu": 10415.25, "total_tokens": 1229123052 }, { "epoch": 0.7783820955238809, "grad_norm": 0.9073508381843567, "learning_rate": 2e-05, "loss": 0.586, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12451, "tokens_per_second_per_gpu": 10232.2, "total_tokens": 1229218644 }, { "epoch": 0.7784446111527882, "grad_norm": 0.9312782883644104, "learning_rate": 2e-05, "loss": 0.6196, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12452, "tokens_per_second_per_gpu": 10837.29, "total_tokens": 1229319406 }, { "epoch": 0.7785071267816954, "grad_norm": 0.9698391556739807, "learning_rate": 2e-05, "loss": 0.6503, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12453, "tokens_per_second_per_gpu": 9348.92, "total_tokens": 1229408931 }, { "epoch": 0.7785696424106027, "grad_norm": 0.9531475305557251, "learning_rate": 2e-05, "loss": 0.6115, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12454, "tokens_per_second_per_gpu": 10399.01, "total_tokens": 1229507709 }, { "epoch": 0.7786321580395099, "grad_norm": 0.8726937770843506, "learning_rate": 2e-05, "loss": 0.6151, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12455, "tokens_per_second_per_gpu": 10067.85, "total_tokens": 1229604306 }, { "epoch": 0.7786946736684172, "grad_norm": 0.906548261642456, "learning_rate": 2e-05, "loss": 0.5709, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12456, "tokens_per_second_per_gpu": 9687.17, "total_tokens": 1229695993 }, { "epoch": 0.7787571892973243, "grad_norm": 0.9190346002578735, "learning_rate": 2e-05, "loss": 0.6402, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12457, "tokens_per_second_per_gpu": 10734.97, "total_tokens": 1229792175 }, { "epoch": 0.7788197049262315, "grad_norm": 0.9178769588470459, "learning_rate": 2e-05, "loss": 0.6173, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12458, "tokens_per_second_per_gpu": 10900.06, "total_tokens": 1229889095 }, { "epoch": 0.7788822205551388, "grad_norm": 0.9255550503730774, "learning_rate": 2e-05, "loss": 0.6527, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12459, "tokens_per_second_per_gpu": 10443.74, "total_tokens": 1229985734 }, { "epoch": 0.778944736184046, "grad_norm": 0.8729568123817444, "learning_rate": 2e-05, "loss": 0.614, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12460, "tokens_per_second_per_gpu": 10589.64, "total_tokens": 1230081996 }, { "epoch": 0.7790072518129533, "grad_norm": 0.9299110770225525, "learning_rate": 2e-05, "loss": 0.599, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12461, "tokens_per_second_per_gpu": 9873.95, "total_tokens": 1230177499 }, { "epoch": 0.7790697674418605, "grad_norm": 0.9294310212135315, "learning_rate": 2e-05, "loss": 0.6368, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12462, "tokens_per_second_per_gpu": 10566.97, "total_tokens": 1230273578 }, { "epoch": 0.7791322830707677, "grad_norm": 0.9023364782333374, "learning_rate": 2e-05, "loss": 0.615, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12463, "tokens_per_second_per_gpu": 10447.07, "total_tokens": 1230370911 }, { "epoch": 0.7791947986996749, "grad_norm": 0.9176614880561829, "learning_rate": 2e-05, "loss": 0.6668, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12464, "tokens_per_second_per_gpu": 11011.72, "total_tokens": 1230468156 }, { "epoch": 0.7792573143285821, "grad_norm": 0.8961222171783447, "learning_rate": 2e-05, "loss": 0.6524, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12465, "tokens_per_second_per_gpu": 10029.15, "total_tokens": 1230564298 }, { "epoch": 0.7793198299574894, "grad_norm": 0.9067570567131042, "learning_rate": 2e-05, "loss": 0.5909, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12466, "tokens_per_second_per_gpu": 10221.7, "total_tokens": 1230660875 }, { "epoch": 0.7793823455863966, "grad_norm": 0.8830237984657288, "learning_rate": 2e-05, "loss": 0.604, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12467, "tokens_per_second_per_gpu": 10730.47, "total_tokens": 1230762843 }, { "epoch": 0.7794448612153039, "grad_norm": 0.8914604783058167, "learning_rate": 2e-05, "loss": 0.6274, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12468, "tokens_per_second_per_gpu": 10104.78, "total_tokens": 1230863246 }, { "epoch": 0.779507376844211, "grad_norm": 0.9821333289146423, "learning_rate": 2e-05, "loss": 0.6301, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12469, "tokens_per_second_per_gpu": 9562.61, "total_tokens": 1230958528 }, { "epoch": 0.7795698924731183, "grad_norm": 0.8700537085533142, "learning_rate": 2e-05, "loss": 0.6284, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12470, "tokens_per_second_per_gpu": 11178.09, "total_tokens": 1231059950 }, { "epoch": 0.7796324081020255, "grad_norm": 0.9597340822219849, "learning_rate": 2e-05, "loss": 0.5937, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12471, "tokens_per_second_per_gpu": 9526.89, "total_tokens": 1231154427 }, { "epoch": 0.7796949237309327, "grad_norm": 0.8773303627967834, "learning_rate": 2e-05, "loss": 0.6017, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12472, "tokens_per_second_per_gpu": 10119.03, "total_tokens": 1231251831 }, { "epoch": 0.77975743935984, "grad_norm": 0.8993616700172424, "learning_rate": 2e-05, "loss": 0.6598, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12473, "tokens_per_second_per_gpu": 10198.72, "total_tokens": 1231350662 }, { "epoch": 0.7798199549887472, "grad_norm": 0.8790465593338013, "learning_rate": 2e-05, "loss": 0.6615, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12474, "tokens_per_second_per_gpu": 10601.87, "total_tokens": 1231448472 }, { "epoch": 0.7798824706176544, "grad_norm": 0.8842141628265381, "learning_rate": 2e-05, "loss": 0.6442, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12475, "tokens_per_second_per_gpu": 10662.14, "total_tokens": 1231546008 }, { "epoch": 0.7799449862465616, "grad_norm": 0.8961877226829529, "learning_rate": 2e-05, "loss": 0.6248, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12476, "tokens_per_second_per_gpu": 10621.05, "total_tokens": 1231646584 }, { "epoch": 0.7800075018754689, "grad_norm": 0.8988856673240662, "learning_rate": 2e-05, "loss": 0.6092, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12477, "tokens_per_second_per_gpu": 10545.56, "total_tokens": 1231743799 }, { "epoch": 0.7800700175043761, "grad_norm": 0.9211463332176208, "learning_rate": 2e-05, "loss": 0.6563, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12478, "tokens_per_second_per_gpu": 11648.75, "total_tokens": 1231846154 }, { "epoch": 0.7801325331332833, "grad_norm": 0.8542845845222473, "learning_rate": 2e-05, "loss": 0.5812, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12479, "tokens_per_second_per_gpu": 10338.18, "total_tokens": 1231946715 }, { "epoch": 0.7801950487621906, "grad_norm": 0.8686656951904297, "learning_rate": 2e-05, "loss": 0.6304, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12480, "tokens_per_second_per_gpu": 11118.28, "total_tokens": 1232050600 }, { "epoch": 0.7802575643910977, "grad_norm": 0.8737034797668457, "learning_rate": 2e-05, "loss": 0.6078, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12481, "tokens_per_second_per_gpu": 10954.15, "total_tokens": 1232151696 }, { "epoch": 0.780320080020005, "grad_norm": 0.8958966732025146, "learning_rate": 2e-05, "loss": 0.6177, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12482, "tokens_per_second_per_gpu": 10396.31, "total_tokens": 1232249471 }, { "epoch": 0.7803825956489122, "grad_norm": 0.9078812003135681, "learning_rate": 2e-05, "loss": 0.6642, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12483, "tokens_per_second_per_gpu": 10484.94, "total_tokens": 1232354888 }, { "epoch": 0.7804451112778195, "grad_norm": 0.9731692671775818, "learning_rate": 2e-05, "loss": 0.6232, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12484, "tokens_per_second_per_gpu": 10659.21, "total_tokens": 1232457616 }, { "epoch": 0.7805076269067267, "grad_norm": 0.8929979801177979, "learning_rate": 2e-05, "loss": 0.6036, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12485, "tokens_per_second_per_gpu": 10684.98, "total_tokens": 1232553949 }, { "epoch": 0.780570142535634, "grad_norm": 0.9246756434440613, "learning_rate": 2e-05, "loss": 0.6435, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12486, "tokens_per_second_per_gpu": 11289.64, "total_tokens": 1232657288 }, { "epoch": 0.7806326581645411, "grad_norm": 0.8799424767494202, "learning_rate": 2e-05, "loss": 0.5863, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12487, "tokens_per_second_per_gpu": 10733.29, "total_tokens": 1232756157 }, { "epoch": 0.7806951737934483, "grad_norm": 0.855480968952179, "learning_rate": 2e-05, "loss": 0.6228, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12488, "tokens_per_second_per_gpu": 10765.17, "total_tokens": 1232859427 }, { "epoch": 0.7807576894223556, "grad_norm": 0.8829020857810974, "learning_rate": 2e-05, "loss": 0.638, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12489, "tokens_per_second_per_gpu": 10862.53, "total_tokens": 1232958435 }, { "epoch": 0.7808202050512628, "grad_norm": 0.9140163064002991, "learning_rate": 2e-05, "loss": 0.6029, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12490, "tokens_per_second_per_gpu": 10106.02, "total_tokens": 1233053446 }, { "epoch": 0.7808827206801701, "grad_norm": 0.9264446496963501, "learning_rate": 2e-05, "loss": 0.6466, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12491, "tokens_per_second_per_gpu": 9997.61, "total_tokens": 1233152219 }, { "epoch": 0.7809452363090773, "grad_norm": 0.8699194192886353, "learning_rate": 2e-05, "loss": 0.6405, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12492, "tokens_per_second_per_gpu": 11223.24, "total_tokens": 1233255367 }, { "epoch": 0.7810077519379846, "grad_norm": 0.8975231051445007, "learning_rate": 2e-05, "loss": 0.6215, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12493, "tokens_per_second_per_gpu": 10204.23, "total_tokens": 1233353925 }, { "epoch": 0.7810702675668917, "grad_norm": 0.8578636646270752, "learning_rate": 2e-05, "loss": 0.5804, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12494, "tokens_per_second_per_gpu": 11151.57, "total_tokens": 1233454472 }, { "epoch": 0.7811327831957989, "grad_norm": 0.8835386633872986, "learning_rate": 2e-05, "loss": 0.6091, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12495, "tokens_per_second_per_gpu": 10133.17, "total_tokens": 1233551225 }, { "epoch": 0.7811952988247062, "grad_norm": 0.8798291087150574, "learning_rate": 2e-05, "loss": 0.589, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12496, "tokens_per_second_per_gpu": 11001.37, "total_tokens": 1233649093 }, { "epoch": 0.7812578144536134, "grad_norm": 0.9408121705055237, "learning_rate": 2e-05, "loss": 0.5947, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12497, "tokens_per_second_per_gpu": 9400.99, "total_tokens": 1233739869 }, { "epoch": 0.7813203300825207, "grad_norm": 0.8918768763542175, "learning_rate": 2e-05, "loss": 0.6116, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12498, "tokens_per_second_per_gpu": 10273.31, "total_tokens": 1233838487 }, { "epoch": 0.7813828457114279, "grad_norm": 0.9052453637123108, "learning_rate": 2e-05, "loss": 0.6218, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12499, "tokens_per_second_per_gpu": 10589.12, "total_tokens": 1233937016 }, { "epoch": 0.781445361340335, "grad_norm": 0.8781528472900391, "learning_rate": 2e-05, "loss": 0.6333, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12500, "tokens_per_second_per_gpu": 10012.56, "total_tokens": 1234035380 }, { "epoch": 0.7815078769692423, "grad_norm": 0.8903401494026184, "learning_rate": 2e-05, "loss": 0.6044, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12501, "tokens_per_second_per_gpu": 10390.98, "total_tokens": 1234134931 }, { "epoch": 0.7815703925981495, "grad_norm": 0.9086765646934509, "learning_rate": 2e-05, "loss": 0.6081, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12502, "tokens_per_second_per_gpu": 9717.05, "total_tokens": 1234228752 }, { "epoch": 0.7816329082270568, "grad_norm": 0.878544270992279, "learning_rate": 2e-05, "loss": 0.5989, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12503, "tokens_per_second_per_gpu": 10325.21, "total_tokens": 1234327224 }, { "epoch": 0.781695423855964, "grad_norm": 0.9405797123908997, "learning_rate": 2e-05, "loss": 0.6331, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12504, "tokens_per_second_per_gpu": 10903.42, "total_tokens": 1234426817 }, { "epoch": 0.7817579394848713, "grad_norm": 0.9338167905807495, "learning_rate": 2e-05, "loss": 0.623, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12505, "tokens_per_second_per_gpu": 11352.58, "total_tokens": 1234527657 }, { "epoch": 0.7818204551137784, "grad_norm": 0.8612998723983765, "learning_rate": 2e-05, "loss": 0.6309, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12506, "tokens_per_second_per_gpu": 10526.76, "total_tokens": 1234627701 }, { "epoch": 0.7818829707426856, "grad_norm": 0.917005717754364, "learning_rate": 2e-05, "loss": 0.6833, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12507, "tokens_per_second_per_gpu": 10310.56, "total_tokens": 1234725700 }, { "epoch": 0.7819454863715929, "grad_norm": 0.9250167608261108, "learning_rate": 2e-05, "loss": 0.5909, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12508, "tokens_per_second_per_gpu": 10536.43, "total_tokens": 1234823423 }, { "epoch": 0.7820080020005001, "grad_norm": 0.8973044753074646, "learning_rate": 2e-05, "loss": 0.6119, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12509, "tokens_per_second_per_gpu": 10496.72, "total_tokens": 1234921426 }, { "epoch": 0.7820705176294074, "grad_norm": 0.8982465863227844, "learning_rate": 2e-05, "loss": 0.6104, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12510, "tokens_per_second_per_gpu": 9989.99, "total_tokens": 1235021637 }, { "epoch": 0.7821330332583146, "grad_norm": 0.9083921909332275, "learning_rate": 2e-05, "loss": 0.6268, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12511, "tokens_per_second_per_gpu": 10582.67, "total_tokens": 1235119799 }, { "epoch": 0.7821955488872218, "grad_norm": 0.9248162508010864, "learning_rate": 2e-05, "loss": 0.6329, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12512, "tokens_per_second_per_gpu": 10337.74, "total_tokens": 1235217299 }, { "epoch": 0.782258064516129, "grad_norm": 0.8808367252349854, "learning_rate": 2e-05, "loss": 0.6029, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12513, "tokens_per_second_per_gpu": 10719.25, "total_tokens": 1235316692 }, { "epoch": 0.7823205801450362, "grad_norm": 0.8898983597755432, "learning_rate": 2e-05, "loss": 0.6107, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12514, "tokens_per_second_per_gpu": 10693.23, "total_tokens": 1235417150 }, { "epoch": 0.7823830957739435, "grad_norm": 0.8824555277824402, "learning_rate": 2e-05, "loss": 0.6076, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12515, "tokens_per_second_per_gpu": 11572.26, "total_tokens": 1235515022 }, { "epoch": 0.7824456114028507, "grad_norm": 0.9262604117393494, "learning_rate": 2e-05, "loss": 0.6517, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12516, "tokens_per_second_per_gpu": 9799.78, "total_tokens": 1235611049 }, { "epoch": 0.782508127031758, "grad_norm": 0.9128891229629517, "learning_rate": 2e-05, "loss": 0.6351, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12517, "tokens_per_second_per_gpu": 10681.84, "total_tokens": 1235708827 }, { "epoch": 0.7825706426606651, "grad_norm": 0.8337674140930176, "learning_rate": 2e-05, "loss": 0.5728, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12518, "tokens_per_second_per_gpu": 9898.23, "total_tokens": 1235807390 }, { "epoch": 0.7826331582895724, "grad_norm": 0.8864964246749878, "learning_rate": 2e-05, "loss": 0.6249, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12519, "tokens_per_second_per_gpu": 10852.71, "total_tokens": 1235907914 }, { "epoch": 0.7826956739184796, "grad_norm": 0.8640884757041931, "learning_rate": 2e-05, "loss": 0.5843, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12520, "tokens_per_second_per_gpu": 9881.01, "total_tokens": 1236006076 }, { "epoch": 0.7827581895473869, "grad_norm": 0.8814975023269653, "learning_rate": 2e-05, "loss": 0.6496, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12521, "tokens_per_second_per_gpu": 10749.15, "total_tokens": 1236104401 }, { "epoch": 0.7828207051762941, "grad_norm": 0.927422821521759, "learning_rate": 2e-05, "loss": 0.644, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12522, "tokens_per_second_per_gpu": 9927.82, "total_tokens": 1236198606 }, { "epoch": 0.7828832208052013, "grad_norm": 0.9295581579208374, "learning_rate": 2e-05, "loss": 0.6475, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12523, "tokens_per_second_per_gpu": 10077.22, "total_tokens": 1236297204 }, { "epoch": 0.7829457364341085, "grad_norm": 0.8938547968864441, "learning_rate": 2e-05, "loss": 0.6028, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12524, "tokens_per_second_per_gpu": 10733.58, "total_tokens": 1236396625 }, { "epoch": 0.7830082520630157, "grad_norm": 0.8704836368560791, "learning_rate": 2e-05, "loss": 0.6156, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12525, "tokens_per_second_per_gpu": 10642.89, "total_tokens": 1236494762 }, { "epoch": 0.783070767691923, "grad_norm": 0.9144406914710999, "learning_rate": 2e-05, "loss": 0.6348, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12526, "tokens_per_second_per_gpu": 10572.52, "total_tokens": 1236592929 }, { "epoch": 0.7831332833208302, "grad_norm": 0.9092353582382202, "learning_rate": 2e-05, "loss": 0.6218, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12527, "tokens_per_second_per_gpu": 10323.96, "total_tokens": 1236691862 }, { "epoch": 0.7831957989497375, "grad_norm": 0.914181113243103, "learning_rate": 2e-05, "loss": 0.6183, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12528, "tokens_per_second_per_gpu": 10078.35, "total_tokens": 1236790005 }, { "epoch": 0.7832583145786447, "grad_norm": 0.9038707613945007, "learning_rate": 2e-05, "loss": 0.6128, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12529, "tokens_per_second_per_gpu": 10620.45, "total_tokens": 1236886747 }, { "epoch": 0.7833208302075518, "grad_norm": 0.894774317741394, "learning_rate": 2e-05, "loss": 0.6224, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12530, "tokens_per_second_per_gpu": 10653.27, "total_tokens": 1236985978 }, { "epoch": 0.7833833458364591, "grad_norm": 0.9067853689193726, "learning_rate": 2e-05, "loss": 0.6259, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12531, "tokens_per_second_per_gpu": 9543.17, "total_tokens": 1237082698 }, { "epoch": 0.7834458614653663, "grad_norm": 0.8788992762565613, "learning_rate": 2e-05, "loss": 0.6048, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12532, "tokens_per_second_per_gpu": 9572.34, "total_tokens": 1237179174 }, { "epoch": 0.7835083770942736, "grad_norm": 0.8815253973007202, "learning_rate": 2e-05, "loss": 0.6151, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12533, "tokens_per_second_per_gpu": 10719.14, "total_tokens": 1237282189 }, { "epoch": 0.7835708927231808, "grad_norm": 0.8970009088516235, "learning_rate": 2e-05, "loss": 0.6371, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12534, "tokens_per_second_per_gpu": 10315.37, "total_tokens": 1237379817 }, { "epoch": 0.7836334083520881, "grad_norm": 0.9044984579086304, "learning_rate": 2e-05, "loss": 0.6096, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12535, "tokens_per_second_per_gpu": 10642.84, "total_tokens": 1237476920 }, { "epoch": 0.7836959239809953, "grad_norm": 0.8652017116546631, "learning_rate": 2e-05, "loss": 0.6021, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12536, "tokens_per_second_per_gpu": 11163.44, "total_tokens": 1237578454 }, { "epoch": 0.7837584396099024, "grad_norm": 0.8705130219459534, "learning_rate": 2e-05, "loss": 0.6584, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12537, "tokens_per_second_per_gpu": 10068.09, "total_tokens": 1237677651 }, { "epoch": 0.7838209552388097, "grad_norm": 0.8299655318260193, "learning_rate": 2e-05, "loss": 0.5859, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12538, "tokens_per_second_per_gpu": 10885.7, "total_tokens": 1237778214 }, { "epoch": 0.7838834708677169, "grad_norm": 0.8521511554718018, "learning_rate": 2e-05, "loss": 0.6387, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12539, "tokens_per_second_per_gpu": 10268.02, "total_tokens": 1237881406 }, { "epoch": 0.7839459864966242, "grad_norm": 0.8798140287399292, "learning_rate": 2e-05, "loss": 0.6348, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12540, "tokens_per_second_per_gpu": 10664.9, "total_tokens": 1237981348 }, { "epoch": 0.7840085021255314, "grad_norm": 0.8821558356285095, "learning_rate": 2e-05, "loss": 0.6092, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12541, "tokens_per_second_per_gpu": 9890.83, "total_tokens": 1238080010 }, { "epoch": 0.7840710177544387, "grad_norm": 0.9023316502571106, "learning_rate": 2e-05, "loss": 0.6365, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12542, "tokens_per_second_per_gpu": 10219.08, "total_tokens": 1238179805 }, { "epoch": 0.7841335333833458, "grad_norm": 0.9101753830909729, "learning_rate": 2e-05, "loss": 0.6329, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12543, "tokens_per_second_per_gpu": 10737.08, "total_tokens": 1238275771 }, { "epoch": 0.784196049012253, "grad_norm": 0.9201102256774902, "learning_rate": 2e-05, "loss": 0.6387, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12544, "tokens_per_second_per_gpu": 11496.73, "total_tokens": 1238380635 }, { "epoch": 0.7842585646411603, "grad_norm": 0.870066225528717, "learning_rate": 2e-05, "loss": 0.5887, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12545, "tokens_per_second_per_gpu": 10773.92, "total_tokens": 1238476993 }, { "epoch": 0.7843210802700675, "grad_norm": 0.8448747396469116, "learning_rate": 2e-05, "loss": 0.5809, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12546, "tokens_per_second_per_gpu": 10479.2, "total_tokens": 1238575654 }, { "epoch": 0.7843835958989748, "grad_norm": 0.8833549618721008, "learning_rate": 2e-05, "loss": 0.6135, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12547, "tokens_per_second_per_gpu": 10539.6, "total_tokens": 1238673401 }, { "epoch": 0.784446111527882, "grad_norm": 0.8800719380378723, "learning_rate": 2e-05, "loss": 0.6269, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12548, "tokens_per_second_per_gpu": 10564.6, "total_tokens": 1238771866 }, { "epoch": 0.7845086271567892, "grad_norm": 0.918740451335907, "learning_rate": 2e-05, "loss": 0.6219, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12549, "tokens_per_second_per_gpu": 10831.88, "total_tokens": 1238869862 }, { "epoch": 0.7845711427856964, "grad_norm": 0.8760623931884766, "learning_rate": 2e-05, "loss": 0.6135, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12550, "tokens_per_second_per_gpu": 11111.15, "total_tokens": 1238968789 }, { "epoch": 0.7846336584146036, "grad_norm": 0.8980112075805664, "learning_rate": 2e-05, "loss": 0.6819, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12551, "tokens_per_second_per_gpu": 11110.4, "total_tokens": 1239071553 }, { "epoch": 0.7846961740435109, "grad_norm": 0.9099471569061279, "learning_rate": 2e-05, "loss": 0.6102, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12552, "tokens_per_second_per_gpu": 10623.5, "total_tokens": 1239166095 }, { "epoch": 0.7847586896724181, "grad_norm": 0.8780084848403931, "learning_rate": 2e-05, "loss": 0.574, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12553, "tokens_per_second_per_gpu": 10634.05, "total_tokens": 1239265222 }, { "epoch": 0.7848212053013254, "grad_norm": 0.8752497434616089, "learning_rate": 2e-05, "loss": 0.5943, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12554, "tokens_per_second_per_gpu": 10210.27, "total_tokens": 1239360700 }, { "epoch": 0.7848837209302325, "grad_norm": 0.9037569165229797, "learning_rate": 2e-05, "loss": 0.602, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12555, "tokens_per_second_per_gpu": 10474.54, "total_tokens": 1239455661 }, { "epoch": 0.7849462365591398, "grad_norm": 0.9096698760986328, "learning_rate": 2e-05, "loss": 0.6314, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12556, "tokens_per_second_per_gpu": 10148.18, "total_tokens": 1239552068 }, { "epoch": 0.785008752188047, "grad_norm": 0.8713763952255249, "learning_rate": 2e-05, "loss": 0.6167, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12557, "tokens_per_second_per_gpu": 9836.03, "total_tokens": 1239649742 }, { "epoch": 0.7850712678169542, "grad_norm": 0.8662256002426147, "learning_rate": 2e-05, "loss": 0.6583, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12558, "tokens_per_second_per_gpu": 11812.37, "total_tokens": 1239754546 }, { "epoch": 0.7851337834458615, "grad_norm": 0.884902834892273, "learning_rate": 2e-05, "loss": 0.5865, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12559, "tokens_per_second_per_gpu": 9892.51, "total_tokens": 1239852006 }, { "epoch": 0.7851962990747687, "grad_norm": 0.9080816507339478, "learning_rate": 2e-05, "loss": 0.6127, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12560, "tokens_per_second_per_gpu": 10631.24, "total_tokens": 1239951981 }, { "epoch": 0.7852588147036759, "grad_norm": 0.9427285194396973, "learning_rate": 2e-05, "loss": 0.6258, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12561, "tokens_per_second_per_gpu": 10509.87, "total_tokens": 1240050931 }, { "epoch": 0.7853213303325831, "grad_norm": 0.8899776339530945, "learning_rate": 2e-05, "loss": 0.6167, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12562, "tokens_per_second_per_gpu": 10620.15, "total_tokens": 1240152607 }, { "epoch": 0.7853838459614904, "grad_norm": 0.8622907996177673, "learning_rate": 2e-05, "loss": 0.6313, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12563, "tokens_per_second_per_gpu": 11119.52, "total_tokens": 1240256030 }, { "epoch": 0.7854463615903976, "grad_norm": 0.8780977129936218, "learning_rate": 2e-05, "loss": 0.5888, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12564, "tokens_per_second_per_gpu": 10358.65, "total_tokens": 1240353462 }, { "epoch": 0.7855088772193048, "grad_norm": 0.9183139204978943, "learning_rate": 2e-05, "loss": 0.6444, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12565, "tokens_per_second_per_gpu": 10713.12, "total_tokens": 1240454012 }, { "epoch": 0.7855713928482121, "grad_norm": 0.8539152145385742, "learning_rate": 2e-05, "loss": 0.5998, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12566, "tokens_per_second_per_gpu": 10814.48, "total_tokens": 1240555091 }, { "epoch": 0.7856339084771192, "grad_norm": 0.8210345506668091, "learning_rate": 2e-05, "loss": 0.5948, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12567, "tokens_per_second_per_gpu": 10315.44, "total_tokens": 1240657962 }, { "epoch": 0.7856964241060265, "grad_norm": 0.8689981698989868, "learning_rate": 2e-05, "loss": 0.5876, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12568, "tokens_per_second_per_gpu": 9909.93, "total_tokens": 1240752174 }, { "epoch": 0.7857589397349337, "grad_norm": 0.896583616733551, "learning_rate": 2e-05, "loss": 0.629, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12569, "tokens_per_second_per_gpu": 11165.63, "total_tokens": 1240851795 }, { "epoch": 0.785821455363841, "grad_norm": 0.9107217788696289, "learning_rate": 2e-05, "loss": 0.6342, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12570, "tokens_per_second_per_gpu": 10405.08, "total_tokens": 1240947063 }, { "epoch": 0.7858839709927482, "grad_norm": 0.9048430323600769, "learning_rate": 2e-05, "loss": 0.6215, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12571, "tokens_per_second_per_gpu": 10902.37, "total_tokens": 1241048207 }, { "epoch": 0.7859464866216554, "grad_norm": 0.8555612564086914, "learning_rate": 2e-05, "loss": 0.5722, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12572, "tokens_per_second_per_gpu": 10172.12, "total_tokens": 1241146241 }, { "epoch": 0.7860090022505627, "grad_norm": 0.9164680242538452, "learning_rate": 2e-05, "loss": 0.5939, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12573, "tokens_per_second_per_gpu": 10560.12, "total_tokens": 1241240730 }, { "epoch": 0.7860715178794698, "grad_norm": 0.8895472288131714, "learning_rate": 2e-05, "loss": 0.6303, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12574, "tokens_per_second_per_gpu": 10290.07, "total_tokens": 1241338420 }, { "epoch": 0.7861340335083771, "grad_norm": 0.8566215634346008, "learning_rate": 2e-05, "loss": 0.5974, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12575, "tokens_per_second_per_gpu": 10650.67, "total_tokens": 1241440226 }, { "epoch": 0.7861965491372843, "grad_norm": 0.8883996605873108, "learning_rate": 2e-05, "loss": 0.6198, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12576, "tokens_per_second_per_gpu": 9999.85, "total_tokens": 1241537370 }, { "epoch": 0.7862590647661916, "grad_norm": 0.8948072195053101, "learning_rate": 2e-05, "loss": 0.6139, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12577, "tokens_per_second_per_gpu": 10311.52, "total_tokens": 1241637561 }, { "epoch": 0.7863215803950988, "grad_norm": 0.8667086958885193, "learning_rate": 2e-05, "loss": 0.6255, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12578, "tokens_per_second_per_gpu": 10966.67, "total_tokens": 1241741187 }, { "epoch": 0.786384096024006, "grad_norm": 0.8775007724761963, "learning_rate": 2e-05, "loss": 0.5966, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12579, "tokens_per_second_per_gpu": 11195.09, "total_tokens": 1241838531 }, { "epoch": 0.7864466116529132, "grad_norm": 0.8455607295036316, "learning_rate": 2e-05, "loss": 0.6012, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12580, "tokens_per_second_per_gpu": 11012.99, "total_tokens": 1241942539 }, { "epoch": 0.7865091272818204, "grad_norm": 0.8719421029090881, "learning_rate": 2e-05, "loss": 0.6163, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12581, "tokens_per_second_per_gpu": 10844.56, "total_tokens": 1242041762 }, { "epoch": 0.7865716429107277, "grad_norm": 0.8640996813774109, "learning_rate": 2e-05, "loss": 0.6221, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12582, "tokens_per_second_per_gpu": 9708.67, "total_tokens": 1242142747 }, { "epoch": 0.7866341585396349, "grad_norm": 0.8803614377975464, "learning_rate": 2e-05, "loss": 0.6377, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12583, "tokens_per_second_per_gpu": 10883.53, "total_tokens": 1242245744 }, { "epoch": 0.7866966741685422, "grad_norm": 0.8496203422546387, "learning_rate": 2e-05, "loss": 0.613, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12584, "tokens_per_second_per_gpu": 10822.34, "total_tokens": 1242344513 }, { "epoch": 0.7867591897974494, "grad_norm": 0.9283554553985596, "learning_rate": 2e-05, "loss": 0.6431, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12585, "tokens_per_second_per_gpu": 10536.45, "total_tokens": 1242440868 }, { "epoch": 0.7868217054263565, "grad_norm": 0.8626500964164734, "learning_rate": 2e-05, "loss": 0.6158, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12586, "tokens_per_second_per_gpu": 10983.02, "total_tokens": 1242540683 }, { "epoch": 0.7868842210552638, "grad_norm": 0.9058024883270264, "learning_rate": 2e-05, "loss": 0.6393, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12587, "tokens_per_second_per_gpu": 10762.39, "total_tokens": 1242643440 }, { "epoch": 0.786946736684171, "grad_norm": 0.8905667662620544, "learning_rate": 2e-05, "loss": 0.6599, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12588, "tokens_per_second_per_gpu": 10840.81, "total_tokens": 1242742445 }, { "epoch": 0.7870092523130783, "grad_norm": 0.8823965191841125, "learning_rate": 2e-05, "loss": 0.6537, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12589, "tokens_per_second_per_gpu": 11101.55, "total_tokens": 1242843414 }, { "epoch": 0.7870717679419855, "grad_norm": 0.9171225428581238, "learning_rate": 2e-05, "loss": 0.6498, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12590, "tokens_per_second_per_gpu": 11064.38, "total_tokens": 1242946176 }, { "epoch": 0.7871342835708928, "grad_norm": 0.862614095211029, "learning_rate": 2e-05, "loss": 0.6295, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12591, "tokens_per_second_per_gpu": 10276.18, "total_tokens": 1243046600 }, { "epoch": 0.7871967991997999, "grad_norm": 0.8916676044464111, "learning_rate": 2e-05, "loss": 0.5868, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12592, "tokens_per_second_per_gpu": 9868.69, "total_tokens": 1243142426 }, { "epoch": 0.7872593148287071, "grad_norm": 0.8832539319992065, "learning_rate": 2e-05, "loss": 0.6158, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12593, "tokens_per_second_per_gpu": 9454.54, "total_tokens": 1243235728 }, { "epoch": 0.7873218304576144, "grad_norm": 0.869717001914978, "learning_rate": 2e-05, "loss": 0.6149, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12594, "tokens_per_second_per_gpu": 11055.97, "total_tokens": 1243338137 }, { "epoch": 0.7873843460865216, "grad_norm": 0.8645049333572388, "learning_rate": 2e-05, "loss": 0.6196, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12595, "tokens_per_second_per_gpu": 10772.45, "total_tokens": 1243437628 }, { "epoch": 0.7874468617154289, "grad_norm": 0.9007041454315186, "learning_rate": 2e-05, "loss": 0.6543, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12596, "tokens_per_second_per_gpu": 11157.17, "total_tokens": 1243539284 }, { "epoch": 0.7875093773443361, "grad_norm": 0.8577466011047363, "learning_rate": 2e-05, "loss": 0.6107, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12597, "tokens_per_second_per_gpu": 11328.13, "total_tokens": 1243639730 }, { "epoch": 0.7875718929732433, "grad_norm": 0.9005833864212036, "learning_rate": 2e-05, "loss": 0.6188, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12598, "tokens_per_second_per_gpu": 13123.55, "total_tokens": 1243739308 }, { "epoch": 0.7876344086021505, "grad_norm": 0.8659909963607788, "learning_rate": 2e-05, "loss": 0.5609, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12599, "tokens_per_second_per_gpu": 9887.14, "total_tokens": 1243835270 }, { "epoch": 0.7876969242310577, "grad_norm": 0.899283230304718, "learning_rate": 2e-05, "loss": 0.6174, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12600, "tokens_per_second_per_gpu": 10277.58, "total_tokens": 1243931959 }, { "epoch": 0.787759439859965, "grad_norm": 0.8878579139709473, "learning_rate": 2e-05, "loss": 0.626, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12601, "tokens_per_second_per_gpu": 10849.09, "total_tokens": 1244035598 }, { "epoch": 0.7878219554888722, "grad_norm": 0.9596965312957764, "learning_rate": 2e-05, "loss": 0.6128, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12602, "tokens_per_second_per_gpu": 11255.12, "total_tokens": 1244136678 }, { "epoch": 0.7878844711177795, "grad_norm": 0.8928879499435425, "learning_rate": 2e-05, "loss": 0.6316, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12603, "tokens_per_second_per_gpu": 10480.55, "total_tokens": 1244234143 }, { "epoch": 0.7879469867466866, "grad_norm": 0.9064252376556396, "learning_rate": 2e-05, "loss": 0.6449, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12604, "tokens_per_second_per_gpu": 10491.91, "total_tokens": 1244337038 }, { "epoch": 0.7880095023755939, "grad_norm": 0.8842811584472656, "learning_rate": 2e-05, "loss": 0.6397, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12605, "tokens_per_second_per_gpu": 10344.27, "total_tokens": 1244434474 }, { "epoch": 0.7880720180045011, "grad_norm": 0.9600799679756165, "learning_rate": 2e-05, "loss": 0.6873, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12606, "tokens_per_second_per_gpu": 11135.48, "total_tokens": 1244532866 }, { "epoch": 0.7881345336334084, "grad_norm": 0.893531322479248, "learning_rate": 2e-05, "loss": 0.6224, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12607, "tokens_per_second_per_gpu": 10641.99, "total_tokens": 1244633434 }, { "epoch": 0.7881970492623156, "grad_norm": 0.8774092197418213, "learning_rate": 2e-05, "loss": 0.6652, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12608, "tokens_per_second_per_gpu": 10922.95, "total_tokens": 1244733595 }, { "epoch": 0.7882595648912228, "grad_norm": 0.9201245903968811, "learning_rate": 2e-05, "loss": 0.607, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12609, "tokens_per_second_per_gpu": 11035.91, "total_tokens": 1244834406 }, { "epoch": 0.7883220805201301, "grad_norm": 0.9142425656318665, "learning_rate": 2e-05, "loss": 0.5973, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12610, "tokens_per_second_per_gpu": 10089.9, "total_tokens": 1244932877 }, { "epoch": 0.7883845961490372, "grad_norm": 0.9009000062942505, "learning_rate": 2e-05, "loss": 0.6339, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12611, "tokens_per_second_per_gpu": 10135.49, "total_tokens": 1245034135 }, { "epoch": 0.7884471117779445, "grad_norm": 0.867132842540741, "learning_rate": 2e-05, "loss": 0.621, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12612, "tokens_per_second_per_gpu": 10880.44, "total_tokens": 1245136815 }, { "epoch": 0.7885096274068517, "grad_norm": 0.8872823119163513, "learning_rate": 2e-05, "loss": 0.6064, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12613, "tokens_per_second_per_gpu": 10252.87, "total_tokens": 1245234958 }, { "epoch": 0.788572143035759, "grad_norm": 0.8946673274040222, "learning_rate": 2e-05, "loss": 0.6275, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12614, "tokens_per_second_per_gpu": 10497.21, "total_tokens": 1245336789 }, { "epoch": 0.7886346586646662, "grad_norm": 0.8853766918182373, "learning_rate": 2e-05, "loss": 0.6386, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12615, "tokens_per_second_per_gpu": 10718.0, "total_tokens": 1245438220 }, { "epoch": 0.7886971742935734, "grad_norm": 0.8956097960472107, "learning_rate": 2e-05, "loss": 0.5856, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12616, "tokens_per_second_per_gpu": 10287.07, "total_tokens": 1245534667 }, { "epoch": 0.7887596899224806, "grad_norm": 0.8804501295089722, "learning_rate": 2e-05, "loss": 0.6113, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12617, "tokens_per_second_per_gpu": 10127.74, "total_tokens": 1245634456 }, { "epoch": 0.7888222055513878, "grad_norm": 0.8806105852127075, "learning_rate": 2e-05, "loss": 0.6035, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12618, "tokens_per_second_per_gpu": 10048.97, "total_tokens": 1245732931 }, { "epoch": 0.7888847211802951, "grad_norm": 0.8931782841682434, "learning_rate": 2e-05, "loss": 0.647, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12619, "tokens_per_second_per_gpu": 10725.51, "total_tokens": 1245834155 }, { "epoch": 0.7889472368092023, "grad_norm": 0.9101800918579102, "learning_rate": 2e-05, "loss": 0.6271, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12620, "tokens_per_second_per_gpu": 10862.4, "total_tokens": 1245933668 }, { "epoch": 0.7890097524381096, "grad_norm": 0.8891562819480896, "learning_rate": 2e-05, "loss": 0.6025, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12621, "tokens_per_second_per_gpu": 10520.48, "total_tokens": 1246032063 }, { "epoch": 0.7890722680670168, "grad_norm": 0.8807043433189392, "learning_rate": 2e-05, "loss": 0.6234, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12622, "tokens_per_second_per_gpu": 9943.26, "total_tokens": 1246128714 }, { "epoch": 0.7891347836959239, "grad_norm": 0.8566949367523193, "learning_rate": 2e-05, "loss": 0.6192, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12623, "tokens_per_second_per_gpu": 11168.86, "total_tokens": 1246232203 }, { "epoch": 0.7891972993248312, "grad_norm": 0.8439384698867798, "learning_rate": 2e-05, "loss": 0.5825, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12624, "tokens_per_second_per_gpu": 10709.2, "total_tokens": 1246330665 }, { "epoch": 0.7892598149537384, "grad_norm": 0.896091639995575, "learning_rate": 2e-05, "loss": 0.6718, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12625, "tokens_per_second_per_gpu": 10489.43, "total_tokens": 1246431742 }, { "epoch": 0.7893223305826457, "grad_norm": 0.8945913910865784, "learning_rate": 2e-05, "loss": 0.628, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12626, "tokens_per_second_per_gpu": 10664.71, "total_tokens": 1246533599 }, { "epoch": 0.7893848462115529, "grad_norm": 0.8956328630447388, "learning_rate": 2e-05, "loss": 0.582, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12627, "tokens_per_second_per_gpu": 10198.45, "total_tokens": 1246625776 }, { "epoch": 0.7894473618404602, "grad_norm": 0.8727272748947144, "learning_rate": 2e-05, "loss": 0.5897, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12628, "tokens_per_second_per_gpu": 10945.4, "total_tokens": 1246727261 }, { "epoch": 0.7895098774693673, "grad_norm": 0.9408433437347412, "learning_rate": 2e-05, "loss": 0.6425, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12629, "tokens_per_second_per_gpu": 9991.92, "total_tokens": 1246823135 }, { "epoch": 0.7895723930982745, "grad_norm": 0.8852574229240417, "learning_rate": 2e-05, "loss": 0.6332, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12630, "tokens_per_second_per_gpu": 10180.44, "total_tokens": 1246923320 }, { "epoch": 0.7896349087271818, "grad_norm": 0.8511903285980225, "learning_rate": 2e-05, "loss": 0.6042, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12631, "tokens_per_second_per_gpu": 11110.54, "total_tokens": 1247027060 }, { "epoch": 0.789697424356089, "grad_norm": 0.8708101511001587, "learning_rate": 2e-05, "loss": 0.6142, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12632, "tokens_per_second_per_gpu": 10801.66, "total_tokens": 1247126230 }, { "epoch": 0.7897599399849963, "grad_norm": 0.8491990566253662, "learning_rate": 2e-05, "loss": 0.6011, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12633, "tokens_per_second_per_gpu": 10399.0, "total_tokens": 1247224468 }, { "epoch": 0.7898224556139035, "grad_norm": 0.889350950717926, "learning_rate": 2e-05, "loss": 0.5905, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12634, "tokens_per_second_per_gpu": 10001.32, "total_tokens": 1247320083 }, { "epoch": 0.7898849712428107, "grad_norm": 0.848992109298706, "learning_rate": 2e-05, "loss": 0.6372, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12635, "tokens_per_second_per_gpu": 10960.7, "total_tokens": 1247423884 }, { "epoch": 0.7899474868717179, "grad_norm": 0.8525676727294922, "learning_rate": 2e-05, "loss": 0.6063, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12636, "tokens_per_second_per_gpu": 10529.39, "total_tokens": 1247526187 }, { "epoch": 0.7900100025006251, "grad_norm": 0.8904851675033569, "learning_rate": 2e-05, "loss": 0.5827, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12637, "tokens_per_second_per_gpu": 10025.29, "total_tokens": 1247621910 }, { "epoch": 0.7900725181295324, "grad_norm": 0.8918890357017517, "learning_rate": 2e-05, "loss": 0.6218, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12638, "tokens_per_second_per_gpu": 10705.6, "total_tokens": 1247720153 }, { "epoch": 0.7901350337584396, "grad_norm": 0.8729071617126465, "learning_rate": 2e-05, "loss": 0.6275, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12639, "tokens_per_second_per_gpu": 10544.06, "total_tokens": 1247821108 }, { "epoch": 0.7901975493873469, "grad_norm": 0.8840749263763428, "learning_rate": 2e-05, "loss": 0.6101, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12640, "tokens_per_second_per_gpu": 10548.19, "total_tokens": 1247915913 }, { "epoch": 0.790260065016254, "grad_norm": 0.8721873760223389, "learning_rate": 2e-05, "loss": 0.593, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12641, "tokens_per_second_per_gpu": 10912.62, "total_tokens": 1248013896 }, { "epoch": 0.7903225806451613, "grad_norm": 0.8685094118118286, "learning_rate": 2e-05, "loss": 0.614, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12642, "tokens_per_second_per_gpu": 10797.02, "total_tokens": 1248111738 }, { "epoch": 0.7903850962740685, "grad_norm": 0.9332891702651978, "learning_rate": 2e-05, "loss": 0.6535, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12643, "tokens_per_second_per_gpu": 11504.11, "total_tokens": 1248213234 }, { "epoch": 0.7904476119029757, "grad_norm": 0.8906442523002625, "learning_rate": 2e-05, "loss": 0.5946, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12644, "tokens_per_second_per_gpu": 10735.63, "total_tokens": 1248312409 }, { "epoch": 0.790510127531883, "grad_norm": 0.9013144373893738, "learning_rate": 2e-05, "loss": 0.6608, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12645, "tokens_per_second_per_gpu": 10845.83, "total_tokens": 1248414236 }, { "epoch": 0.7905726431607902, "grad_norm": 0.8937610983848572, "learning_rate": 2e-05, "loss": 0.6446, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12646, "tokens_per_second_per_gpu": 10794.06, "total_tokens": 1248517080 }, { "epoch": 0.7906351587896975, "grad_norm": 0.9157372117042542, "learning_rate": 2e-05, "loss": 0.6267, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12647, "tokens_per_second_per_gpu": 10376.95, "total_tokens": 1248612498 }, { "epoch": 0.7906976744186046, "grad_norm": 0.901028037071228, "learning_rate": 2e-05, "loss": 0.5899, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12648, "tokens_per_second_per_gpu": 9599.17, "total_tokens": 1248707435 }, { "epoch": 0.7907601900475119, "grad_norm": 0.8867960572242737, "learning_rate": 2e-05, "loss": 0.6084, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12649, "tokens_per_second_per_gpu": 10217.64, "total_tokens": 1248803643 }, { "epoch": 0.7908227056764191, "grad_norm": 0.9269009232521057, "learning_rate": 2e-05, "loss": 0.6035, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12650, "tokens_per_second_per_gpu": 10195.19, "total_tokens": 1248897262 }, { "epoch": 0.7908852213053263, "grad_norm": 0.855867862701416, "learning_rate": 2e-05, "loss": 0.6416, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12651, "tokens_per_second_per_gpu": 10477.14, "total_tokens": 1248998152 }, { "epoch": 0.7909477369342336, "grad_norm": 0.8976011276245117, "learning_rate": 2e-05, "loss": 0.5953, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12652, "tokens_per_second_per_gpu": 9471.55, "total_tokens": 1249096731 }, { "epoch": 0.7910102525631408, "grad_norm": 0.8984118103981018, "learning_rate": 2e-05, "loss": 0.6245, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12653, "tokens_per_second_per_gpu": 10822.04, "total_tokens": 1249202896 }, { "epoch": 0.791072768192048, "grad_norm": 0.8797085881233215, "learning_rate": 2e-05, "loss": 0.6456, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12654, "tokens_per_second_per_gpu": 10292.68, "total_tokens": 1249303480 }, { "epoch": 0.7911352838209552, "grad_norm": 0.8920738697052002, "learning_rate": 2e-05, "loss": 0.653, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12655, "tokens_per_second_per_gpu": 11363.63, "total_tokens": 1249404750 }, { "epoch": 0.7911977994498625, "grad_norm": 0.8892822861671448, "learning_rate": 2e-05, "loss": 0.6617, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12656, "tokens_per_second_per_gpu": 10507.87, "total_tokens": 1249503339 }, { "epoch": 0.7912603150787697, "grad_norm": 0.8685694932937622, "learning_rate": 2e-05, "loss": 0.6107, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12657, "tokens_per_second_per_gpu": 10422.32, "total_tokens": 1249604970 }, { "epoch": 0.791322830707677, "grad_norm": 0.8272426724433899, "learning_rate": 2e-05, "loss": 0.593, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12658, "tokens_per_second_per_gpu": 10232.21, "total_tokens": 1249706325 }, { "epoch": 0.7913853463365842, "grad_norm": 0.8600966334342957, "learning_rate": 2e-05, "loss": 0.5868, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12659, "tokens_per_second_per_gpu": 10697.29, "total_tokens": 1249808520 }, { "epoch": 0.7914478619654913, "grad_norm": 0.9120330810546875, "learning_rate": 2e-05, "loss": 0.6361, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12660, "tokens_per_second_per_gpu": 10827.76, "total_tokens": 1249909554 }, { "epoch": 0.7915103775943986, "grad_norm": 0.8813464045524597, "learning_rate": 2e-05, "loss": 0.646, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12661, "tokens_per_second_per_gpu": 11003.42, "total_tokens": 1250011373 }, { "epoch": 0.7915728932233058, "grad_norm": 0.9182623028755188, "learning_rate": 2e-05, "loss": 0.624, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12662, "tokens_per_second_per_gpu": 10713.57, "total_tokens": 1250111972 }, { "epoch": 0.7916354088522131, "grad_norm": 0.9116358757019043, "learning_rate": 2e-05, "loss": 0.6126, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12663, "tokens_per_second_per_gpu": 10736.64, "total_tokens": 1250212717 }, { "epoch": 0.7916979244811203, "grad_norm": 0.9060184955596924, "learning_rate": 2e-05, "loss": 0.6428, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12664, "tokens_per_second_per_gpu": 10542.38, "total_tokens": 1250314353 }, { "epoch": 0.7917604401100276, "grad_norm": 0.8798187971115112, "learning_rate": 2e-05, "loss": 0.6313, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12665, "tokens_per_second_per_gpu": 11101.32, "total_tokens": 1250412435 }, { "epoch": 0.7918229557389347, "grad_norm": 0.8334313631057739, "learning_rate": 2e-05, "loss": 0.572, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12666, "tokens_per_second_per_gpu": 11627.31, "total_tokens": 1250515396 }, { "epoch": 0.7918854713678419, "grad_norm": 0.8994548320770264, "learning_rate": 2e-05, "loss": 0.6069, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12667, "tokens_per_second_per_gpu": 10734.67, "total_tokens": 1250615400 }, { "epoch": 0.7919479869967492, "grad_norm": 0.8667064309120178, "learning_rate": 2e-05, "loss": 0.5991, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12668, "tokens_per_second_per_gpu": 10453.9, "total_tokens": 1250715757 }, { "epoch": 0.7920105026256564, "grad_norm": 0.8610486388206482, "learning_rate": 2e-05, "loss": 0.6179, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12669, "tokens_per_second_per_gpu": 10192.89, "total_tokens": 1250813882 }, { "epoch": 0.7920730182545637, "grad_norm": 0.8816791772842407, "learning_rate": 2e-05, "loss": 0.631, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12670, "tokens_per_second_per_gpu": 10524.46, "total_tokens": 1250915035 }, { "epoch": 0.7921355338834709, "grad_norm": 0.8962424993515015, "learning_rate": 2e-05, "loss": 0.6182, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12671, "tokens_per_second_per_gpu": 10271.14, "total_tokens": 1251011614 }, { "epoch": 0.792198049512378, "grad_norm": 0.9128024578094482, "learning_rate": 2e-05, "loss": 0.6202, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12672, "tokens_per_second_per_gpu": 10155.06, "total_tokens": 1251107366 }, { "epoch": 0.7922605651412853, "grad_norm": 0.8987995982170105, "learning_rate": 2e-05, "loss": 0.6118, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12673, "tokens_per_second_per_gpu": 10284.05, "total_tokens": 1251204319 }, { "epoch": 0.7923230807701925, "grad_norm": 0.8756875395774841, "learning_rate": 2e-05, "loss": 0.6164, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12674, "tokens_per_second_per_gpu": 10334.46, "total_tokens": 1251304152 }, { "epoch": 0.7923855963990998, "grad_norm": 0.8661512136459351, "learning_rate": 2e-05, "loss": 0.6102, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12675, "tokens_per_second_per_gpu": 10613.07, "total_tokens": 1251406757 }, { "epoch": 0.792448112028007, "grad_norm": 0.9090316295623779, "learning_rate": 2e-05, "loss": 0.6047, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12676, "tokens_per_second_per_gpu": 11014.63, "total_tokens": 1251508869 }, { "epoch": 0.7925106276569143, "grad_norm": 0.9107828736305237, "learning_rate": 2e-05, "loss": 0.6516, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12677, "tokens_per_second_per_gpu": 10515.68, "total_tokens": 1251609826 }, { "epoch": 0.7925731432858214, "grad_norm": 0.8623057007789612, "learning_rate": 2e-05, "loss": 0.5975, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12678, "tokens_per_second_per_gpu": 10145.33, "total_tokens": 1251706530 }, { "epoch": 0.7926356589147286, "grad_norm": 0.8938800096511841, "learning_rate": 2e-05, "loss": 0.5727, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12679, "tokens_per_second_per_gpu": 9504.64, "total_tokens": 1251799729 }, { "epoch": 0.7926981745436359, "grad_norm": 0.8698322772979736, "learning_rate": 2e-05, "loss": 0.628, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12680, "tokens_per_second_per_gpu": 11331.47, "total_tokens": 1251904597 }, { "epoch": 0.7927606901725431, "grad_norm": 0.9026215672492981, "learning_rate": 2e-05, "loss": 0.5943, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12681, "tokens_per_second_per_gpu": 10362.26, "total_tokens": 1252002848 }, { "epoch": 0.7928232058014504, "grad_norm": 0.8548229336738586, "learning_rate": 2e-05, "loss": 0.602, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12682, "tokens_per_second_per_gpu": 11053.2, "total_tokens": 1252102956 }, { "epoch": 0.7928857214303576, "grad_norm": 0.899810791015625, "learning_rate": 2e-05, "loss": 0.6305, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12683, "tokens_per_second_per_gpu": 10313.58, "total_tokens": 1252199388 }, { "epoch": 0.7929482370592649, "grad_norm": 0.842278778553009, "learning_rate": 2e-05, "loss": 0.6384, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12684, "tokens_per_second_per_gpu": 11484.96, "total_tokens": 1252306094 }, { "epoch": 0.793010752688172, "grad_norm": 0.8994014859199524, "learning_rate": 2e-05, "loss": 0.6419, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12685, "tokens_per_second_per_gpu": 10941.45, "total_tokens": 1252406496 }, { "epoch": 0.7930732683170792, "grad_norm": 0.916765034198761, "learning_rate": 2e-05, "loss": 0.6169, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12686, "tokens_per_second_per_gpu": 10022.69, "total_tokens": 1252503762 }, { "epoch": 0.7931357839459865, "grad_norm": 0.869393527507782, "learning_rate": 2e-05, "loss": 0.6072, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12687, "tokens_per_second_per_gpu": 10416.25, "total_tokens": 1252602657 }, { "epoch": 0.7931982995748937, "grad_norm": 0.9033971428871155, "learning_rate": 2e-05, "loss": 0.6522, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12688, "tokens_per_second_per_gpu": 10759.92, "total_tokens": 1252702650 }, { "epoch": 0.793260815203801, "grad_norm": 0.8610260486602783, "learning_rate": 2e-05, "loss": 0.5811, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12689, "tokens_per_second_per_gpu": 10411.0, "total_tokens": 1252798891 }, { "epoch": 0.7933233308327082, "grad_norm": 0.85079026222229, "learning_rate": 2e-05, "loss": 0.6282, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12690, "tokens_per_second_per_gpu": 10109.89, "total_tokens": 1252897761 }, { "epoch": 0.7933858464616154, "grad_norm": 0.9002348184585571, "learning_rate": 2e-05, "loss": 0.6113, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12691, "tokens_per_second_per_gpu": 9861.26, "total_tokens": 1252995874 }, { "epoch": 0.7934483620905226, "grad_norm": 0.8706014752388, "learning_rate": 2e-05, "loss": 0.5857, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12692, "tokens_per_second_per_gpu": 10279.57, "total_tokens": 1253095857 }, { "epoch": 0.7935108777194299, "grad_norm": 0.8913655877113342, "learning_rate": 2e-05, "loss": 0.6013, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12693, "tokens_per_second_per_gpu": 9953.22, "total_tokens": 1253192240 }, { "epoch": 0.7935733933483371, "grad_norm": 0.8622874617576599, "learning_rate": 2e-05, "loss": 0.5985, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12694, "tokens_per_second_per_gpu": 11130.87, "total_tokens": 1253292196 }, { "epoch": 0.7936359089772443, "grad_norm": 0.9061312675476074, "learning_rate": 2e-05, "loss": 0.6128, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12695, "tokens_per_second_per_gpu": 9623.03, "total_tokens": 1253387624 }, { "epoch": 0.7936984246061516, "grad_norm": 0.8945703506469727, "learning_rate": 2e-05, "loss": 0.6746, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12696, "tokens_per_second_per_gpu": 11100.16, "total_tokens": 1253493212 }, { "epoch": 0.7937609402350587, "grad_norm": 0.8794608116149902, "learning_rate": 2e-05, "loss": 0.6683, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12697, "tokens_per_second_per_gpu": 10602.11, "total_tokens": 1253592353 }, { "epoch": 0.793823455863966, "grad_norm": 0.8698947429656982, "learning_rate": 2e-05, "loss": 0.6044, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12698, "tokens_per_second_per_gpu": 9885.0, "total_tokens": 1253690076 }, { "epoch": 0.7938859714928732, "grad_norm": 0.890542209148407, "learning_rate": 2e-05, "loss": 0.5739, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12699, "tokens_per_second_per_gpu": 10168.6, "total_tokens": 1253785942 }, { "epoch": 0.7939484871217805, "grad_norm": 0.8816598653793335, "learning_rate": 2e-05, "loss": 0.5718, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12700, "tokens_per_second_per_gpu": 9737.08, "total_tokens": 1253880970 }, { "epoch": 0.7940110027506877, "grad_norm": 0.9008182287216187, "learning_rate": 2e-05, "loss": 0.6351, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12701, "tokens_per_second_per_gpu": 10211.49, "total_tokens": 1253981125 }, { "epoch": 0.794073518379595, "grad_norm": 0.9249761700630188, "learning_rate": 2e-05, "loss": 0.6191, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12702, "tokens_per_second_per_gpu": 10313.82, "total_tokens": 1254078690 }, { "epoch": 0.7941360340085021, "grad_norm": 0.9202727675437927, "learning_rate": 2e-05, "loss": 0.6385, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12703, "tokens_per_second_per_gpu": 10807.1, "total_tokens": 1254181632 }, { "epoch": 0.7941985496374093, "grad_norm": 0.9280090928077698, "learning_rate": 2e-05, "loss": 0.628, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12704, "tokens_per_second_per_gpu": 10879.45, "total_tokens": 1254278984 }, { "epoch": 0.7942610652663166, "grad_norm": 0.8808423280715942, "learning_rate": 2e-05, "loss": 0.6088, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12705, "tokens_per_second_per_gpu": 10463.22, "total_tokens": 1254376043 }, { "epoch": 0.7943235808952238, "grad_norm": 0.8740032315254211, "learning_rate": 2e-05, "loss": 0.5694, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12706, "tokens_per_second_per_gpu": 10925.43, "total_tokens": 1254472427 }, { "epoch": 0.7943860965241311, "grad_norm": 0.8767005801200867, "learning_rate": 2e-05, "loss": 0.6404, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12707, "tokens_per_second_per_gpu": 10752.16, "total_tokens": 1254576672 }, { "epoch": 0.7944486121530383, "grad_norm": 0.8622219562530518, "learning_rate": 2e-05, "loss": 0.5857, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12708, "tokens_per_second_per_gpu": 10535.97, "total_tokens": 1254675479 }, { "epoch": 0.7945111277819454, "grad_norm": 0.8779869079589844, "learning_rate": 2e-05, "loss": 0.6389, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12709, "tokens_per_second_per_gpu": 11313.99, "total_tokens": 1254779765 }, { "epoch": 0.7945736434108527, "grad_norm": 0.9044080376625061, "learning_rate": 2e-05, "loss": 0.6558, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12710, "tokens_per_second_per_gpu": 11083.29, "total_tokens": 1254878643 }, { "epoch": 0.7946361590397599, "grad_norm": 0.9211326241493225, "learning_rate": 2e-05, "loss": 0.5883, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12711, "tokens_per_second_per_gpu": 10445.94, "total_tokens": 1254973799 }, { "epoch": 0.7946986746686672, "grad_norm": 0.8836677670478821, "learning_rate": 2e-05, "loss": 0.6183, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12712, "tokens_per_second_per_gpu": 10606.83, "total_tokens": 1255075906 }, { "epoch": 0.7947611902975744, "grad_norm": 0.895078718662262, "learning_rate": 2e-05, "loss": 0.6243, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12713, "tokens_per_second_per_gpu": 10564.21, "total_tokens": 1255173688 }, { "epoch": 0.7948237059264817, "grad_norm": 0.8913509845733643, "learning_rate": 2e-05, "loss": 0.6137, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12714, "tokens_per_second_per_gpu": 10087.04, "total_tokens": 1255265369 }, { "epoch": 0.7948862215553888, "grad_norm": 0.9168179631233215, "learning_rate": 2e-05, "loss": 0.6321, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12715, "tokens_per_second_per_gpu": 10769.38, "total_tokens": 1255365248 }, { "epoch": 0.794948737184296, "grad_norm": 0.9103885889053345, "learning_rate": 2e-05, "loss": 0.6356, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12716, "tokens_per_second_per_gpu": 10472.03, "total_tokens": 1255464252 }, { "epoch": 0.7950112528132033, "grad_norm": 0.8719504475593567, "learning_rate": 2e-05, "loss": 0.6073, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12717, "tokens_per_second_per_gpu": 10802.88, "total_tokens": 1255562460 }, { "epoch": 0.7950737684421105, "grad_norm": 0.8336690068244934, "learning_rate": 2e-05, "loss": 0.6229, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12718, "tokens_per_second_per_gpu": 10876.63, "total_tokens": 1255665354 }, { "epoch": 0.7951362840710178, "grad_norm": 0.8716872930526733, "learning_rate": 2e-05, "loss": 0.5932, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12719, "tokens_per_second_per_gpu": 11022.83, "total_tokens": 1255765815 }, { "epoch": 0.795198799699925, "grad_norm": 0.8690244555473328, "learning_rate": 2e-05, "loss": 0.6205, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12720, "tokens_per_second_per_gpu": 11882.87, "total_tokens": 1255869010 }, { "epoch": 0.7952613153288323, "grad_norm": 0.8343892097473145, "learning_rate": 2e-05, "loss": 0.6007, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12721, "tokens_per_second_per_gpu": 10384.2, "total_tokens": 1255968158 }, { "epoch": 0.7953238309577394, "grad_norm": 0.9398530125617981, "learning_rate": 2e-05, "loss": 0.6316, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12722, "tokens_per_second_per_gpu": 11267.35, "total_tokens": 1256069381 }, { "epoch": 0.7953863465866466, "grad_norm": 0.8945696353912354, "learning_rate": 2e-05, "loss": 0.6066, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12723, "tokens_per_second_per_gpu": 10899.56, "total_tokens": 1256168125 }, { "epoch": 0.7954488622155539, "grad_norm": 0.8835975527763367, "learning_rate": 2e-05, "loss": 0.5802, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12724, "tokens_per_second_per_gpu": 10218.19, "total_tokens": 1256268738 }, { "epoch": 0.7955113778444611, "grad_norm": 0.9240714311599731, "learning_rate": 2e-05, "loss": 0.6447, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12725, "tokens_per_second_per_gpu": 10672.21, "total_tokens": 1256369250 }, { "epoch": 0.7955738934733684, "grad_norm": 0.9023127555847168, "learning_rate": 2e-05, "loss": 0.5972, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12726, "tokens_per_second_per_gpu": 10493.84, "total_tokens": 1256464972 }, { "epoch": 0.7956364091022756, "grad_norm": 0.9001582860946655, "learning_rate": 2e-05, "loss": 0.6307, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12727, "tokens_per_second_per_gpu": 11183.28, "total_tokens": 1256568984 }, { "epoch": 0.7956989247311828, "grad_norm": 0.9135381579399109, "learning_rate": 2e-05, "loss": 0.6103, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12728, "tokens_per_second_per_gpu": 10694.27, "total_tokens": 1256667217 }, { "epoch": 0.79576144036009, "grad_norm": 0.8540703654289246, "learning_rate": 2e-05, "loss": 0.614, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12729, "tokens_per_second_per_gpu": 11092.15, "total_tokens": 1256773662 }, { "epoch": 0.7958239559889972, "grad_norm": 0.8990798592567444, "learning_rate": 2e-05, "loss": 0.6016, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12730, "tokens_per_second_per_gpu": 10258.25, "total_tokens": 1256871808 }, { "epoch": 0.7958864716179045, "grad_norm": 0.8914753198623657, "learning_rate": 2e-05, "loss": 0.6418, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12731, "tokens_per_second_per_gpu": 10992.36, "total_tokens": 1256974643 }, { "epoch": 0.7959489872468117, "grad_norm": 0.9106576442718506, "learning_rate": 2e-05, "loss": 0.6418, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12732, "tokens_per_second_per_gpu": 10811.63, "total_tokens": 1257071982 }, { "epoch": 0.796011502875719, "grad_norm": 0.9030969738960266, "learning_rate": 2e-05, "loss": 0.5842, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12733, "tokens_per_second_per_gpu": 10197.32, "total_tokens": 1257170640 }, { "epoch": 0.7960740185046261, "grad_norm": 0.8867893218994141, "learning_rate": 2e-05, "loss": 0.5923, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12734, "tokens_per_second_per_gpu": 10451.8, "total_tokens": 1257271856 }, { "epoch": 0.7961365341335334, "grad_norm": 0.8963668942451477, "learning_rate": 2e-05, "loss": 0.6331, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12735, "tokens_per_second_per_gpu": 10720.86, "total_tokens": 1257374071 }, { "epoch": 0.7961990497624406, "grad_norm": 0.8704109787940979, "learning_rate": 2e-05, "loss": 0.6042, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12736, "tokens_per_second_per_gpu": 10427.19, "total_tokens": 1257474447 }, { "epoch": 0.7962615653913478, "grad_norm": 0.8746411204338074, "learning_rate": 2e-05, "loss": 0.555, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12737, "tokens_per_second_per_gpu": 9708.3, "total_tokens": 1257570386 }, { "epoch": 0.7963240810202551, "grad_norm": 0.932177722454071, "learning_rate": 2e-05, "loss": 0.575, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12738, "tokens_per_second_per_gpu": 10130.3, "total_tokens": 1257665520 }, { "epoch": 0.7963865966491623, "grad_norm": 0.8605626225471497, "learning_rate": 2e-05, "loss": 0.5771, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12739, "tokens_per_second_per_gpu": 10730.25, "total_tokens": 1257765901 }, { "epoch": 0.7964491122780695, "grad_norm": 0.8789950013160706, "learning_rate": 2e-05, "loss": 0.6105, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12740, "tokens_per_second_per_gpu": 10859.21, "total_tokens": 1257864764 }, { "epoch": 0.7965116279069767, "grad_norm": 0.9017049074172974, "learning_rate": 2e-05, "loss": 0.6248, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12741, "tokens_per_second_per_gpu": 11658.73, "total_tokens": 1257963254 }, { "epoch": 0.796574143535884, "grad_norm": 0.8715358972549438, "learning_rate": 2e-05, "loss": 0.6339, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12742, "tokens_per_second_per_gpu": 10743.01, "total_tokens": 1258065020 }, { "epoch": 0.7966366591647912, "grad_norm": 0.9695530533790588, "learning_rate": 2e-05, "loss": 0.6407, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12743, "tokens_per_second_per_gpu": 10374.45, "total_tokens": 1258160325 }, { "epoch": 0.7966991747936985, "grad_norm": 0.9392255544662476, "learning_rate": 2e-05, "loss": 0.666, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12744, "tokens_per_second_per_gpu": 9596.85, "total_tokens": 1258256200 }, { "epoch": 0.7967616904226057, "grad_norm": 0.950674295425415, "learning_rate": 2e-05, "loss": 0.6202, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12745, "tokens_per_second_per_gpu": 10583.53, "total_tokens": 1258352282 }, { "epoch": 0.7968242060515128, "grad_norm": 0.8823454976081848, "learning_rate": 2e-05, "loss": 0.6123, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12746, "tokens_per_second_per_gpu": 10382.81, "total_tokens": 1258452554 }, { "epoch": 0.7968867216804201, "grad_norm": 0.9054075479507446, "learning_rate": 2e-05, "loss": 0.6229, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12747, "tokens_per_second_per_gpu": 10722.07, "total_tokens": 1258555642 }, { "epoch": 0.7969492373093273, "grad_norm": 0.8875075578689575, "learning_rate": 2e-05, "loss": 0.588, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12748, "tokens_per_second_per_gpu": 8889.59, "total_tokens": 1258648955 }, { "epoch": 0.7970117529382346, "grad_norm": 0.9079375267028809, "learning_rate": 2e-05, "loss": 0.6043, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12749, "tokens_per_second_per_gpu": 10684.69, "total_tokens": 1258750667 }, { "epoch": 0.7970742685671418, "grad_norm": 0.9086877703666687, "learning_rate": 2e-05, "loss": 0.6097, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12750, "tokens_per_second_per_gpu": 10579.21, "total_tokens": 1258849300 }, { "epoch": 0.797136784196049, "grad_norm": 0.9410396218299866, "learning_rate": 2e-05, "loss": 0.6197, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12751, "tokens_per_second_per_gpu": 10180.4, "total_tokens": 1258946467 }, { "epoch": 0.7971992998249562, "grad_norm": 0.8886657953262329, "learning_rate": 2e-05, "loss": 0.6346, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12752, "tokens_per_second_per_gpu": 10606.93, "total_tokens": 1259047497 }, { "epoch": 0.7972618154538634, "grad_norm": 0.9072084426879883, "learning_rate": 2e-05, "loss": 0.6266, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12753, "tokens_per_second_per_gpu": 10285.72, "total_tokens": 1259144045 }, { "epoch": 0.7973243310827707, "grad_norm": 0.9243983030319214, "learning_rate": 2e-05, "loss": 0.6447, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12754, "tokens_per_second_per_gpu": 9766.58, "total_tokens": 1259242336 }, { "epoch": 0.7973868467116779, "grad_norm": 0.9501469731330872, "learning_rate": 2e-05, "loss": 0.6369, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12755, "tokens_per_second_per_gpu": 11000.24, "total_tokens": 1259343595 }, { "epoch": 0.7974493623405852, "grad_norm": 0.905093252658844, "learning_rate": 2e-05, "loss": 0.5862, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12756, "tokens_per_second_per_gpu": 10013.3, "total_tokens": 1259438743 }, { "epoch": 0.7975118779694924, "grad_norm": 0.8948354125022888, "learning_rate": 2e-05, "loss": 0.5834, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12757, "tokens_per_second_per_gpu": 10628.73, "total_tokens": 1259534251 }, { "epoch": 0.7975743935983995, "grad_norm": 0.9187562465667725, "learning_rate": 2e-05, "loss": 0.6544, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12758, "tokens_per_second_per_gpu": 10216.0, "total_tokens": 1259630876 }, { "epoch": 0.7976369092273068, "grad_norm": 0.8944520950317383, "learning_rate": 2e-05, "loss": 0.6439, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12759, "tokens_per_second_per_gpu": 11159.88, "total_tokens": 1259733520 }, { "epoch": 0.797699424856214, "grad_norm": 0.8572989106178284, "learning_rate": 2e-05, "loss": 0.6162, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12760, "tokens_per_second_per_gpu": 10812.4, "total_tokens": 1259835204 }, { "epoch": 0.7977619404851213, "grad_norm": 0.907450795173645, "learning_rate": 2e-05, "loss": 0.6309, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12761, "tokens_per_second_per_gpu": 10948.58, "total_tokens": 1259931245 }, { "epoch": 0.7978244561140285, "grad_norm": 0.9431495666503906, "learning_rate": 2e-05, "loss": 0.6138, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12762, "tokens_per_second_per_gpu": 9222.46, "total_tokens": 1260024312 }, { "epoch": 0.7978869717429358, "grad_norm": 0.9007846713066101, "learning_rate": 2e-05, "loss": 0.6127, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12763, "tokens_per_second_per_gpu": 10566.51, "total_tokens": 1260122887 }, { "epoch": 0.797949487371843, "grad_norm": 0.9022151827812195, "learning_rate": 2e-05, "loss": 0.6208, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12764, "tokens_per_second_per_gpu": 10344.22, "total_tokens": 1260221551 }, { "epoch": 0.7980120030007501, "grad_norm": 0.887484610080719, "learning_rate": 2e-05, "loss": 0.5976, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12765, "tokens_per_second_per_gpu": 11166.38, "total_tokens": 1260325816 }, { "epoch": 0.7980745186296574, "grad_norm": 0.8643275499343872, "learning_rate": 2e-05, "loss": 0.6119, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12766, "tokens_per_second_per_gpu": 11190.01, "total_tokens": 1260425914 }, { "epoch": 0.7981370342585646, "grad_norm": 0.8868942856788635, "learning_rate": 2e-05, "loss": 0.6199, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12767, "tokens_per_second_per_gpu": 10791.55, "total_tokens": 1260525159 }, { "epoch": 0.7981995498874719, "grad_norm": 0.8915841579437256, "learning_rate": 2e-05, "loss": 0.6572, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12768, "tokens_per_second_per_gpu": 10713.81, "total_tokens": 1260625346 }, { "epoch": 0.7982620655163791, "grad_norm": 0.8878340721130371, "learning_rate": 2e-05, "loss": 0.5744, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12769, "tokens_per_second_per_gpu": 10781.13, "total_tokens": 1260721360 }, { "epoch": 0.7983245811452864, "grad_norm": 0.9080827832221985, "learning_rate": 2e-05, "loss": 0.6208, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12770, "tokens_per_second_per_gpu": 9877.12, "total_tokens": 1260817803 }, { "epoch": 0.7983870967741935, "grad_norm": 0.8401397466659546, "learning_rate": 2e-05, "loss": 0.574, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12771, "tokens_per_second_per_gpu": 10657.05, "total_tokens": 1260914131 }, { "epoch": 0.7984496124031008, "grad_norm": 0.9437247514724731, "learning_rate": 2e-05, "loss": 0.6708, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12772, "tokens_per_second_per_gpu": 10962.9, "total_tokens": 1261014457 }, { "epoch": 0.798512128032008, "grad_norm": 0.864271342754364, "learning_rate": 2e-05, "loss": 0.5641, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12773, "tokens_per_second_per_gpu": 11011.13, "total_tokens": 1261115477 }, { "epoch": 0.7985746436609152, "grad_norm": 0.8714104890823364, "learning_rate": 2e-05, "loss": 0.5755, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12774, "tokens_per_second_per_gpu": 10157.27, "total_tokens": 1261211911 }, { "epoch": 0.7986371592898225, "grad_norm": 0.9444116950035095, "learning_rate": 2e-05, "loss": 0.6235, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12775, "tokens_per_second_per_gpu": 11069.21, "total_tokens": 1261313672 }, { "epoch": 0.7986996749187297, "grad_norm": 0.8675410151481628, "learning_rate": 2e-05, "loss": 0.6413, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12776, "tokens_per_second_per_gpu": 10662.12, "total_tokens": 1261412685 }, { "epoch": 0.7987621905476369, "grad_norm": 0.8649545907974243, "learning_rate": 2e-05, "loss": 0.6357, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12777, "tokens_per_second_per_gpu": 10631.91, "total_tokens": 1261517542 }, { "epoch": 0.7988247061765441, "grad_norm": 0.8696640729904175, "learning_rate": 2e-05, "loss": 0.604, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12778, "tokens_per_second_per_gpu": 10255.25, "total_tokens": 1261619396 }, { "epoch": 0.7988872218054514, "grad_norm": 0.9138002991676331, "learning_rate": 2e-05, "loss": 0.6391, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12779, "tokens_per_second_per_gpu": 10420.78, "total_tokens": 1261716508 }, { "epoch": 0.7989497374343586, "grad_norm": 0.8823311924934387, "learning_rate": 2e-05, "loss": 0.6493, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12780, "tokens_per_second_per_gpu": 11126.26, "total_tokens": 1261819277 }, { "epoch": 0.7990122530632658, "grad_norm": 0.88617342710495, "learning_rate": 2e-05, "loss": 0.643, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12781, "tokens_per_second_per_gpu": 10965.45, "total_tokens": 1261919841 }, { "epoch": 0.7990747686921731, "grad_norm": 0.9085372686386108, "learning_rate": 2e-05, "loss": 0.6042, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12782, "tokens_per_second_per_gpu": 9628.19, "total_tokens": 1262011775 }, { "epoch": 0.7991372843210802, "grad_norm": 0.8983914852142334, "learning_rate": 2e-05, "loss": 0.659, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12783, "tokens_per_second_per_gpu": 10712.25, "total_tokens": 1262113952 }, { "epoch": 0.7991997999499875, "grad_norm": 0.901378870010376, "learning_rate": 2e-05, "loss": 0.6109, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12784, "tokens_per_second_per_gpu": 10971.38, "total_tokens": 1262212699 }, { "epoch": 0.7992623155788947, "grad_norm": 0.895460844039917, "learning_rate": 2e-05, "loss": 0.608, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12785, "tokens_per_second_per_gpu": 10298.68, "total_tokens": 1262308873 }, { "epoch": 0.799324831207802, "grad_norm": 0.8522258400917053, "learning_rate": 2e-05, "loss": 0.5979, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12786, "tokens_per_second_per_gpu": 11168.75, "total_tokens": 1262411220 }, { "epoch": 0.7993873468367092, "grad_norm": 0.9024431109428406, "learning_rate": 2e-05, "loss": 0.668, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12787, "tokens_per_second_per_gpu": 9025.88, "total_tokens": 1262507633 }, { "epoch": 0.7994498624656164, "grad_norm": 0.864618718624115, "learning_rate": 2e-05, "loss": 0.6512, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12788, "tokens_per_second_per_gpu": 10465.12, "total_tokens": 1262608923 }, { "epoch": 0.7995123780945236, "grad_norm": 0.8848280310630798, "learning_rate": 2e-05, "loss": 0.6429, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12789, "tokens_per_second_per_gpu": 10140.16, "total_tokens": 1262708904 }, { "epoch": 0.7995748937234308, "grad_norm": 0.9064098596572876, "learning_rate": 2e-05, "loss": 0.601, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12790, "tokens_per_second_per_gpu": 10724.25, "total_tokens": 1262808504 }, { "epoch": 0.7996374093523381, "grad_norm": 0.8870677351951599, "learning_rate": 2e-05, "loss": 0.6381, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12791, "tokens_per_second_per_gpu": 10633.17, "total_tokens": 1262906856 }, { "epoch": 0.7996999249812453, "grad_norm": 0.9352366328239441, "learning_rate": 2e-05, "loss": 0.621, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12792, "tokens_per_second_per_gpu": 10109.17, "total_tokens": 1263004507 }, { "epoch": 0.7997624406101526, "grad_norm": 0.8736135363578796, "learning_rate": 2e-05, "loss": 0.5983, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12793, "tokens_per_second_per_gpu": 9478.09, "total_tokens": 1263101606 }, { "epoch": 0.7998249562390598, "grad_norm": 0.8971607685089111, "learning_rate": 2e-05, "loss": 0.5843, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12794, "tokens_per_second_per_gpu": 10068.67, "total_tokens": 1263197626 }, { "epoch": 0.7998874718679669, "grad_norm": 0.8665899038314819, "learning_rate": 2e-05, "loss": 0.6572, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12795, "tokens_per_second_per_gpu": 11533.19, "total_tokens": 1263303336 }, { "epoch": 0.7999499874968742, "grad_norm": 0.9138312339782715, "learning_rate": 2e-05, "loss": 0.6201, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12796, "tokens_per_second_per_gpu": 9950.5, "total_tokens": 1263400283 }, { "epoch": 0.8000125031257814, "grad_norm": 0.8896999359130859, "learning_rate": 2e-05, "loss": 0.6904, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12797, "tokens_per_second_per_gpu": 11001.67, "total_tokens": 1263502025 }, { "epoch": 0.8000750187546887, "grad_norm": 0.9223418235778809, "learning_rate": 2e-05, "loss": 0.6597, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12798, "tokens_per_second_per_gpu": 10168.52, "total_tokens": 1263602197 }, { "epoch": 0.8001375343835959, "grad_norm": 0.8695328235626221, "learning_rate": 2e-05, "loss": 0.6214, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12799, "tokens_per_second_per_gpu": 11090.79, "total_tokens": 1263702988 }, { "epoch": 0.8002000500125032, "grad_norm": 0.8806321620941162, "learning_rate": 2e-05, "loss": 0.5874, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12800, "tokens_per_second_per_gpu": 10291.06, "total_tokens": 1263804059 }, { "epoch": 0.8002625656414104, "grad_norm": 0.8963698744773865, "learning_rate": 2e-05, "loss": 0.6237, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12801, "tokens_per_second_per_gpu": 10869.52, "total_tokens": 1263908017 }, { "epoch": 0.8003250812703175, "grad_norm": 0.8890554308891296, "learning_rate": 2e-05, "loss": 0.6225, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12802, "tokens_per_second_per_gpu": 10964.57, "total_tokens": 1264009642 }, { "epoch": 0.8003875968992248, "grad_norm": 0.8935026526451111, "learning_rate": 2e-05, "loss": 0.6012, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12803, "tokens_per_second_per_gpu": 10268.46, "total_tokens": 1264107329 }, { "epoch": 0.800450112528132, "grad_norm": 0.8772532939910889, "learning_rate": 2e-05, "loss": 0.6038, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12804, "tokens_per_second_per_gpu": 10751.09, "total_tokens": 1264203444 }, { "epoch": 0.8005126281570393, "grad_norm": 0.9530687928199768, "learning_rate": 2e-05, "loss": 0.5954, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12805, "tokens_per_second_per_gpu": 10919.65, "total_tokens": 1264301576 }, { "epoch": 0.8005751437859465, "grad_norm": 0.8847585916519165, "learning_rate": 2e-05, "loss": 0.6276, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12806, "tokens_per_second_per_gpu": 11484.27, "total_tokens": 1264404356 }, { "epoch": 0.8006376594148538, "grad_norm": 0.8832808136940002, "learning_rate": 2e-05, "loss": 0.6228, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12807, "tokens_per_second_per_gpu": 10477.25, "total_tokens": 1264503922 }, { "epoch": 0.8007001750437609, "grad_norm": 0.8977769017219543, "learning_rate": 2e-05, "loss": 0.6036, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12808, "tokens_per_second_per_gpu": 10861.92, "total_tokens": 1264604621 }, { "epoch": 0.8007626906726681, "grad_norm": 0.8943188190460205, "learning_rate": 2e-05, "loss": 0.6129, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12809, "tokens_per_second_per_gpu": 10784.65, "total_tokens": 1264704472 }, { "epoch": 0.8008252063015754, "grad_norm": 0.8969284296035767, "learning_rate": 2e-05, "loss": 0.6532, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12810, "tokens_per_second_per_gpu": 10863.24, "total_tokens": 1264809204 }, { "epoch": 0.8008877219304826, "grad_norm": 0.9513566493988037, "learning_rate": 2e-05, "loss": 0.6569, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12811, "tokens_per_second_per_gpu": 10752.11, "total_tokens": 1264909320 }, { "epoch": 0.8009502375593899, "grad_norm": 0.9103562831878662, "learning_rate": 2e-05, "loss": 0.6115, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12812, "tokens_per_second_per_gpu": 10720.1, "total_tokens": 1265008379 }, { "epoch": 0.8010127531882971, "grad_norm": 0.8898134231567383, "learning_rate": 2e-05, "loss": 0.5975, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12813, "tokens_per_second_per_gpu": 10468.3, "total_tokens": 1265104411 }, { "epoch": 0.8010752688172043, "grad_norm": 0.8822646737098694, "learning_rate": 2e-05, "loss": 0.6104, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12814, "tokens_per_second_per_gpu": 10973.92, "total_tokens": 1265204910 }, { "epoch": 0.8011377844461115, "grad_norm": 0.8844586610794067, "learning_rate": 2e-05, "loss": 0.6106, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12815, "tokens_per_second_per_gpu": 11511.77, "total_tokens": 1265308511 }, { "epoch": 0.8012003000750187, "grad_norm": 0.8776362538337708, "learning_rate": 2e-05, "loss": 0.6187, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12816, "tokens_per_second_per_gpu": 10602.81, "total_tokens": 1265406530 }, { "epoch": 0.801262815703926, "grad_norm": 0.9192338585853577, "learning_rate": 2e-05, "loss": 0.5881, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12817, "tokens_per_second_per_gpu": 10770.73, "total_tokens": 1265504359 }, { "epoch": 0.8013253313328332, "grad_norm": 0.8945571780204773, "learning_rate": 2e-05, "loss": 0.6541, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12818, "tokens_per_second_per_gpu": 11146.47, "total_tokens": 1265605775 }, { "epoch": 0.8013878469617405, "grad_norm": 0.8855087161064148, "learning_rate": 2e-05, "loss": 0.6137, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12819, "tokens_per_second_per_gpu": 10753.92, "total_tokens": 1265705209 }, { "epoch": 0.8014503625906476, "grad_norm": 0.872915506362915, "learning_rate": 2e-05, "loss": 0.6441, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12820, "tokens_per_second_per_gpu": 11042.73, "total_tokens": 1265807152 }, { "epoch": 0.8015128782195549, "grad_norm": 0.9021613001823425, "learning_rate": 2e-05, "loss": 0.6187, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12821, "tokens_per_second_per_gpu": 10385.64, "total_tokens": 1265903332 }, { "epoch": 0.8015753938484621, "grad_norm": 0.9180586934089661, "learning_rate": 2e-05, "loss": 0.5773, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12822, "tokens_per_second_per_gpu": 10304.41, "total_tokens": 1265998082 }, { "epoch": 0.8016379094773693, "grad_norm": 0.8819557428359985, "learning_rate": 2e-05, "loss": 0.6389, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12823, "tokens_per_second_per_gpu": 11022.58, "total_tokens": 1266100329 }, { "epoch": 0.8017004251062766, "grad_norm": 0.8915129899978638, "learning_rate": 2e-05, "loss": 0.6081, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12824, "tokens_per_second_per_gpu": 10644.99, "total_tokens": 1266197193 }, { "epoch": 0.8017629407351838, "grad_norm": 0.8932814598083496, "learning_rate": 2e-05, "loss": 0.5997, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12825, "tokens_per_second_per_gpu": 10429.63, "total_tokens": 1266292888 }, { "epoch": 0.801825456364091, "grad_norm": 0.9048425555229187, "learning_rate": 2e-05, "loss": 0.601, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12826, "tokens_per_second_per_gpu": 10580.63, "total_tokens": 1266390402 }, { "epoch": 0.8018879719929982, "grad_norm": 0.8823131918907166, "learning_rate": 2e-05, "loss": 0.5965, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12827, "tokens_per_second_per_gpu": 10803.81, "total_tokens": 1266489953 }, { "epoch": 0.8019504876219055, "grad_norm": 0.883750319480896, "learning_rate": 2e-05, "loss": 0.5792, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12828, "tokens_per_second_per_gpu": 11107.58, "total_tokens": 1266586255 }, { "epoch": 0.8020130032508127, "grad_norm": 0.8981868624687195, "learning_rate": 2e-05, "loss": 0.665, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12829, "tokens_per_second_per_gpu": 10659.64, "total_tokens": 1266687402 }, { "epoch": 0.80207551887972, "grad_norm": 0.9039249420166016, "learning_rate": 2e-05, "loss": 0.6387, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12830, "tokens_per_second_per_gpu": 10943.07, "total_tokens": 1266788924 }, { "epoch": 0.8021380345086272, "grad_norm": 0.9176528453826904, "learning_rate": 2e-05, "loss": 0.6293, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12831, "tokens_per_second_per_gpu": 10083.55, "total_tokens": 1266887368 }, { "epoch": 0.8022005501375343, "grad_norm": 0.8665867447853088, "learning_rate": 2e-05, "loss": 0.6183, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12832, "tokens_per_second_per_gpu": 10860.34, "total_tokens": 1266987221 }, { "epoch": 0.8022630657664416, "grad_norm": 0.9079553484916687, "learning_rate": 2e-05, "loss": 0.6516, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12833, "tokens_per_second_per_gpu": 10894.1, "total_tokens": 1267084556 }, { "epoch": 0.8023255813953488, "grad_norm": 0.8849397301673889, "learning_rate": 2e-05, "loss": 0.6153, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12834, "tokens_per_second_per_gpu": 12426.27, "total_tokens": 1267180193 }, { "epoch": 0.8023880970242561, "grad_norm": 0.8675379753112793, "learning_rate": 2e-05, "loss": 0.6562, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12835, "tokens_per_second_per_gpu": 11451.5, "total_tokens": 1267282959 }, { "epoch": 0.8024506126531633, "grad_norm": 0.8763009309768677, "learning_rate": 2e-05, "loss": 0.5837, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12836, "tokens_per_second_per_gpu": 10070.07, "total_tokens": 1267381779 }, { "epoch": 0.8025131282820706, "grad_norm": 0.8878234624862671, "learning_rate": 2e-05, "loss": 0.6419, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12837, "tokens_per_second_per_gpu": 10528.0, "total_tokens": 1267480588 }, { "epoch": 0.8025756439109778, "grad_norm": 0.9169232249259949, "learning_rate": 2e-05, "loss": 0.6151, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12838, "tokens_per_second_per_gpu": 10669.41, "total_tokens": 1267581377 }, { "epoch": 0.8026381595398849, "grad_norm": 0.8929159045219421, "learning_rate": 2e-05, "loss": 0.5762, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12839, "tokens_per_second_per_gpu": 10327.01, "total_tokens": 1267675649 }, { "epoch": 0.8027006751687922, "grad_norm": 0.8679779171943665, "learning_rate": 2e-05, "loss": 0.57, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12840, "tokens_per_second_per_gpu": 9843.03, "total_tokens": 1267770243 }, { "epoch": 0.8027631907976994, "grad_norm": 0.866755485534668, "learning_rate": 2e-05, "loss": 0.6464, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12841, "tokens_per_second_per_gpu": 10579.43, "total_tokens": 1267873143 }, { "epoch": 0.8028257064266067, "grad_norm": 0.9403992891311646, "learning_rate": 2e-05, "loss": 0.6251, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12842, "tokens_per_second_per_gpu": 10953.4, "total_tokens": 1267972433 }, { "epoch": 0.8028882220555139, "grad_norm": 0.8554205894470215, "learning_rate": 2e-05, "loss": 0.6031, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12843, "tokens_per_second_per_gpu": 10430.58, "total_tokens": 1268071550 }, { "epoch": 0.8029507376844212, "grad_norm": 0.8903199434280396, "learning_rate": 2e-05, "loss": 0.6117, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12844, "tokens_per_second_per_gpu": 11051.54, "total_tokens": 1268174179 }, { "epoch": 0.8030132533133283, "grad_norm": 0.8698702454566956, "learning_rate": 2e-05, "loss": 0.6273, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12845, "tokens_per_second_per_gpu": 11244.16, "total_tokens": 1268278591 }, { "epoch": 0.8030757689422355, "grad_norm": 0.9017547965049744, "learning_rate": 2e-05, "loss": 0.6244, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12846, "tokens_per_second_per_gpu": 10485.8, "total_tokens": 1268376629 }, { "epoch": 0.8031382845711428, "grad_norm": 0.8992354273796082, "learning_rate": 2e-05, "loss": 0.6077, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12847, "tokens_per_second_per_gpu": 11247.42, "total_tokens": 1268478015 }, { "epoch": 0.80320080020005, "grad_norm": 0.8928443193435669, "learning_rate": 2e-05, "loss": 0.6751, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12848, "tokens_per_second_per_gpu": 10874.94, "total_tokens": 1268578290 }, { "epoch": 0.8032633158289573, "grad_norm": 0.9012300968170166, "learning_rate": 2e-05, "loss": 0.6334, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12849, "tokens_per_second_per_gpu": 10316.0, "total_tokens": 1268676957 }, { "epoch": 0.8033258314578645, "grad_norm": 0.9140139222145081, "learning_rate": 2e-05, "loss": 0.6572, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12850, "tokens_per_second_per_gpu": 9930.23, "total_tokens": 1268775136 }, { "epoch": 0.8033883470867716, "grad_norm": 0.8942232728004456, "learning_rate": 2e-05, "loss": 0.5878, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12851, "tokens_per_second_per_gpu": 9731.14, "total_tokens": 1268867773 }, { "epoch": 0.8034508627156789, "grad_norm": 0.8885786533355713, "learning_rate": 2e-05, "loss": 0.6281, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12852, "tokens_per_second_per_gpu": 10471.84, "total_tokens": 1268968169 }, { "epoch": 0.8035133783445861, "grad_norm": 0.8774971961975098, "learning_rate": 2e-05, "loss": 0.5882, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12853, "tokens_per_second_per_gpu": 10934.19, "total_tokens": 1269070604 }, { "epoch": 0.8035758939734934, "grad_norm": 0.8547618389129639, "learning_rate": 2e-05, "loss": 0.6175, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12854, "tokens_per_second_per_gpu": 11142.03, "total_tokens": 1269173310 }, { "epoch": 0.8036384096024006, "grad_norm": 0.8829687833786011, "learning_rate": 2e-05, "loss": 0.6161, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12855, "tokens_per_second_per_gpu": 10548.93, "total_tokens": 1269275051 }, { "epoch": 0.8037009252313079, "grad_norm": 0.9046392440795898, "learning_rate": 2e-05, "loss": 0.6423, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12856, "tokens_per_second_per_gpu": 9722.77, "total_tokens": 1269370633 }, { "epoch": 0.803763440860215, "grad_norm": 0.8544191718101501, "learning_rate": 2e-05, "loss": 0.6503, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12857, "tokens_per_second_per_gpu": 10541.36, "total_tokens": 1269473422 }, { "epoch": 0.8038259564891223, "grad_norm": 0.8686819672584534, "learning_rate": 2e-05, "loss": 0.6139, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12858, "tokens_per_second_per_gpu": 10844.8, "total_tokens": 1269576856 }, { "epoch": 0.8038884721180295, "grad_norm": 0.8820943236351013, "learning_rate": 2e-05, "loss": 0.6305, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12859, "tokens_per_second_per_gpu": 11159.54, "total_tokens": 1269678415 }, { "epoch": 0.8039509877469367, "grad_norm": 0.8943895697593689, "learning_rate": 2e-05, "loss": 0.5965, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12860, "tokens_per_second_per_gpu": 10677.03, "total_tokens": 1269773256 }, { "epoch": 0.804013503375844, "grad_norm": 0.883606493473053, "learning_rate": 2e-05, "loss": 0.6456, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12861, "tokens_per_second_per_gpu": 11018.76, "total_tokens": 1269873762 }, { "epoch": 0.8040760190047512, "grad_norm": 0.8763582706451416, "learning_rate": 2e-05, "loss": 0.6618, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12862, "tokens_per_second_per_gpu": 11104.64, "total_tokens": 1269977386 }, { "epoch": 0.8041385346336584, "grad_norm": 0.93198561668396, "learning_rate": 2e-05, "loss": 0.6626, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12863, "tokens_per_second_per_gpu": 10181.6, "total_tokens": 1270076385 }, { "epoch": 0.8042010502625656, "grad_norm": 0.8791041374206543, "learning_rate": 2e-05, "loss": 0.6359, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12864, "tokens_per_second_per_gpu": 11591.67, "total_tokens": 1270173762 }, { "epoch": 0.8042635658914729, "grad_norm": 0.8551649451255798, "learning_rate": 2e-05, "loss": 0.6322, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12865, "tokens_per_second_per_gpu": 11081.02, "total_tokens": 1270273278 }, { "epoch": 0.8043260815203801, "grad_norm": 0.8953256011009216, "learning_rate": 2e-05, "loss": 0.616, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12866, "tokens_per_second_per_gpu": 11007.88, "total_tokens": 1270376533 }, { "epoch": 0.8043885971492873, "grad_norm": 0.8748937249183655, "learning_rate": 2e-05, "loss": 0.6886, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12867, "tokens_per_second_per_gpu": 10696.35, "total_tokens": 1270477601 }, { "epoch": 0.8044511127781946, "grad_norm": 0.8516857028007507, "learning_rate": 2e-05, "loss": 0.6379, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12868, "tokens_per_second_per_gpu": 10926.3, "total_tokens": 1270580942 }, { "epoch": 0.8045136284071017, "grad_norm": 0.8784983158111572, "learning_rate": 2e-05, "loss": 0.5881, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12869, "tokens_per_second_per_gpu": 10534.03, "total_tokens": 1270676628 }, { "epoch": 0.804576144036009, "grad_norm": 0.9060110449790955, "learning_rate": 2e-05, "loss": 0.6484, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12870, "tokens_per_second_per_gpu": 9595.5, "total_tokens": 1270768007 }, { "epoch": 0.8046386596649162, "grad_norm": 0.9358770251274109, "learning_rate": 2e-05, "loss": 0.5998, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12871, "tokens_per_second_per_gpu": 9894.07, "total_tokens": 1270859121 }, { "epoch": 0.8047011752938235, "grad_norm": 0.8984710574150085, "learning_rate": 2e-05, "loss": 0.6246, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12872, "tokens_per_second_per_gpu": 11261.94, "total_tokens": 1270962908 }, { "epoch": 0.8047636909227307, "grad_norm": 0.9033672213554382, "learning_rate": 2e-05, "loss": 0.6364, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12873, "tokens_per_second_per_gpu": 11115.12, "total_tokens": 1271065016 }, { "epoch": 0.804826206551638, "grad_norm": 0.8693908452987671, "learning_rate": 2e-05, "loss": 0.639, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12874, "tokens_per_second_per_gpu": 11606.11, "total_tokens": 1271168815 }, { "epoch": 0.8048887221805452, "grad_norm": 0.8719748258590698, "learning_rate": 2e-05, "loss": 0.5864, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12875, "tokens_per_second_per_gpu": 9959.72, "total_tokens": 1271264672 }, { "epoch": 0.8049512378094523, "grad_norm": 0.9073182344436646, "learning_rate": 2e-05, "loss": 0.6371, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12876, "tokens_per_second_per_gpu": 9599.95, "total_tokens": 1271359728 }, { "epoch": 0.8050137534383596, "grad_norm": 0.9000474810600281, "learning_rate": 2e-05, "loss": 0.6242, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12877, "tokens_per_second_per_gpu": 10322.29, "total_tokens": 1271457300 }, { "epoch": 0.8050762690672668, "grad_norm": 0.8690941333770752, "learning_rate": 2e-05, "loss": 0.5773, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12878, "tokens_per_second_per_gpu": 10466.69, "total_tokens": 1271550553 }, { "epoch": 0.8051387846961741, "grad_norm": 0.8685513734817505, "learning_rate": 2e-05, "loss": 0.6191, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12879, "tokens_per_second_per_gpu": 10272.7, "total_tokens": 1271648548 }, { "epoch": 0.8052013003250813, "grad_norm": 0.880044162273407, "learning_rate": 2e-05, "loss": 0.6396, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12880, "tokens_per_second_per_gpu": 10671.42, "total_tokens": 1271748066 }, { "epoch": 0.8052638159539885, "grad_norm": 0.8933972716331482, "learning_rate": 2e-05, "loss": 0.6273, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12881, "tokens_per_second_per_gpu": 10472.65, "total_tokens": 1271846121 }, { "epoch": 0.8053263315828957, "grad_norm": 0.879741907119751, "learning_rate": 2e-05, "loss": 0.5963, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12882, "tokens_per_second_per_gpu": 10373.74, "total_tokens": 1271942084 }, { "epoch": 0.8053888472118029, "grad_norm": 0.9313714504241943, "learning_rate": 2e-05, "loss": 0.6147, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12883, "tokens_per_second_per_gpu": 10162.4, "total_tokens": 1272035224 }, { "epoch": 0.8054513628407102, "grad_norm": 0.8838342428207397, "learning_rate": 2e-05, "loss": 0.6179, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12884, "tokens_per_second_per_gpu": 10065.86, "total_tokens": 1272132341 }, { "epoch": 0.8055138784696174, "grad_norm": 0.8914661407470703, "learning_rate": 2e-05, "loss": 0.6288, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12885, "tokens_per_second_per_gpu": 10223.46, "total_tokens": 1272227854 }, { "epoch": 0.8055763940985247, "grad_norm": 0.8828141689300537, "learning_rate": 2e-05, "loss": 0.6071, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12886, "tokens_per_second_per_gpu": 10453.03, "total_tokens": 1272325299 }, { "epoch": 0.8056389097274319, "grad_norm": 0.9016745686531067, "learning_rate": 2e-05, "loss": 0.5979, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12887, "tokens_per_second_per_gpu": 10685.56, "total_tokens": 1272423099 }, { "epoch": 0.805701425356339, "grad_norm": 0.9052243828773499, "learning_rate": 2e-05, "loss": 0.6261, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12888, "tokens_per_second_per_gpu": 10343.87, "total_tokens": 1272523308 }, { "epoch": 0.8057639409852463, "grad_norm": 0.8965408802032471, "learning_rate": 2e-05, "loss": 0.6972, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12889, "tokens_per_second_per_gpu": 11019.92, "total_tokens": 1272628805 }, { "epoch": 0.8058264566141535, "grad_norm": 0.9194414615631104, "learning_rate": 2e-05, "loss": 0.6445, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12890, "tokens_per_second_per_gpu": 10213.56, "total_tokens": 1272729841 }, { "epoch": 0.8058889722430608, "grad_norm": 0.8442424535751343, "learning_rate": 2e-05, "loss": 0.6055, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12891, "tokens_per_second_per_gpu": 10683.05, "total_tokens": 1272828564 }, { "epoch": 0.805951487871968, "grad_norm": 0.9158919453620911, "learning_rate": 2e-05, "loss": 0.6063, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12892, "tokens_per_second_per_gpu": 9022.29, "total_tokens": 1272920663 }, { "epoch": 0.8060140035008753, "grad_norm": 0.8997184634208679, "learning_rate": 2e-05, "loss": 0.6204, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12893, "tokens_per_second_per_gpu": 10174.69, "total_tokens": 1273015212 }, { "epoch": 0.8060765191297824, "grad_norm": 0.883002519607544, "learning_rate": 2e-05, "loss": 0.5676, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12894, "tokens_per_second_per_gpu": 10906.0, "total_tokens": 1273111057 }, { "epoch": 0.8061390347586896, "grad_norm": 0.8601428866386414, "learning_rate": 2e-05, "loss": 0.6205, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12895, "tokens_per_second_per_gpu": 10876.64, "total_tokens": 1273210793 }, { "epoch": 0.8062015503875969, "grad_norm": 0.8961443901062012, "learning_rate": 2e-05, "loss": 0.6193, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12896, "tokens_per_second_per_gpu": 10903.25, "total_tokens": 1273309351 }, { "epoch": 0.8062640660165041, "grad_norm": 0.8970110416412354, "learning_rate": 2e-05, "loss": 0.6246, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12897, "tokens_per_second_per_gpu": 10352.6, "total_tokens": 1273406303 }, { "epoch": 0.8063265816454114, "grad_norm": 0.8732602000236511, "learning_rate": 2e-05, "loss": 0.6239, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12898, "tokens_per_second_per_gpu": 11309.2, "total_tokens": 1273509221 }, { "epoch": 0.8063890972743186, "grad_norm": 0.8525297045707703, "learning_rate": 2e-05, "loss": 0.5851, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12899, "tokens_per_second_per_gpu": 10627.27, "total_tokens": 1273609413 }, { "epoch": 0.8064516129032258, "grad_norm": 0.8760159015655518, "learning_rate": 2e-05, "loss": 0.6438, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12900, "tokens_per_second_per_gpu": 10474.21, "total_tokens": 1273711631 }, { "epoch": 0.806514128532133, "grad_norm": 0.8859637975692749, "learning_rate": 2e-05, "loss": 0.6281, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12901, "tokens_per_second_per_gpu": 10643.12, "total_tokens": 1273811301 }, { "epoch": 0.8065766441610402, "grad_norm": 0.9029484987258911, "learning_rate": 2e-05, "loss": 0.6282, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12902, "tokens_per_second_per_gpu": 10340.52, "total_tokens": 1273909027 }, { "epoch": 0.8066391597899475, "grad_norm": 0.9560083150863647, "learning_rate": 2e-05, "loss": 0.6334, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12903, "tokens_per_second_per_gpu": 10845.9, "total_tokens": 1274010922 }, { "epoch": 0.8067016754188547, "grad_norm": 0.9009312987327576, "learning_rate": 2e-05, "loss": 0.6343, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12904, "tokens_per_second_per_gpu": 10669.5, "total_tokens": 1274111057 }, { "epoch": 0.806764191047762, "grad_norm": 0.9212546944618225, "learning_rate": 2e-05, "loss": 0.6203, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12905, "tokens_per_second_per_gpu": 9740.35, "total_tokens": 1274205590 }, { "epoch": 0.8068267066766691, "grad_norm": 0.8867778778076172, "learning_rate": 2e-05, "loss": 0.6051, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12906, "tokens_per_second_per_gpu": 9866.17, "total_tokens": 1274299366 }, { "epoch": 0.8068892223055764, "grad_norm": 0.8648465275764465, "learning_rate": 2e-05, "loss": 0.6075, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12907, "tokens_per_second_per_gpu": 10425.43, "total_tokens": 1274399508 }, { "epoch": 0.8069517379344836, "grad_norm": 0.9219222664833069, "learning_rate": 2e-05, "loss": 0.6034, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12908, "tokens_per_second_per_gpu": 10876.14, "total_tokens": 1274496637 }, { "epoch": 0.8070142535633908, "grad_norm": 0.882165253162384, "learning_rate": 2e-05, "loss": 0.5744, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12909, "tokens_per_second_per_gpu": 9596.25, "total_tokens": 1274587200 }, { "epoch": 0.8070767691922981, "grad_norm": 0.8694128394126892, "learning_rate": 2e-05, "loss": 0.5991, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12910, "tokens_per_second_per_gpu": 10937.14, "total_tokens": 1274686557 }, { "epoch": 0.8071392848212053, "grad_norm": 0.8568481206893921, "learning_rate": 2e-05, "loss": 0.5875, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12911, "tokens_per_second_per_gpu": 10465.86, "total_tokens": 1274787588 }, { "epoch": 0.8072018004501126, "grad_norm": 0.8759718537330627, "learning_rate": 2e-05, "loss": 0.6132, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12912, "tokens_per_second_per_gpu": 10921.08, "total_tokens": 1274891020 }, { "epoch": 0.8072643160790197, "grad_norm": 0.8811723589897156, "learning_rate": 2e-05, "loss": 0.6413, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12913, "tokens_per_second_per_gpu": 10948.35, "total_tokens": 1274993796 }, { "epoch": 0.807326831707927, "grad_norm": 0.8465383052825928, "learning_rate": 2e-05, "loss": 0.6022, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12914, "tokens_per_second_per_gpu": 10587.39, "total_tokens": 1275093294 }, { "epoch": 0.8073893473368342, "grad_norm": 0.8971978425979614, "learning_rate": 2e-05, "loss": 0.5898, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12915, "tokens_per_second_per_gpu": 10229.41, "total_tokens": 1275191874 }, { "epoch": 0.8074518629657415, "grad_norm": 0.9198029041290283, "learning_rate": 2e-05, "loss": 0.6438, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12916, "tokens_per_second_per_gpu": 10413.18, "total_tokens": 1275289723 }, { "epoch": 0.8075143785946487, "grad_norm": 0.9213365912437439, "learning_rate": 2e-05, "loss": 0.626, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12917, "tokens_per_second_per_gpu": 10467.85, "total_tokens": 1275388452 }, { "epoch": 0.8075768942235559, "grad_norm": 0.9022735357284546, "learning_rate": 2e-05, "loss": 0.6172, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12918, "tokens_per_second_per_gpu": 10396.17, "total_tokens": 1275487044 }, { "epoch": 0.8076394098524631, "grad_norm": 0.8820986151695251, "learning_rate": 2e-05, "loss": 0.6351, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12919, "tokens_per_second_per_gpu": 10592.45, "total_tokens": 1275586371 }, { "epoch": 0.8077019254813703, "grad_norm": 0.9004083275794983, "learning_rate": 2e-05, "loss": 0.6052, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12920, "tokens_per_second_per_gpu": 10475.78, "total_tokens": 1275685853 }, { "epoch": 0.8077644411102776, "grad_norm": 0.8978664875030518, "learning_rate": 2e-05, "loss": 0.5979, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12921, "tokens_per_second_per_gpu": 9827.51, "total_tokens": 1275780762 }, { "epoch": 0.8078269567391848, "grad_norm": 0.9099427461624146, "learning_rate": 2e-05, "loss": 0.6377, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12922, "tokens_per_second_per_gpu": 11183.27, "total_tokens": 1275884454 }, { "epoch": 0.807889472368092, "grad_norm": 0.8771103024482727, "learning_rate": 2e-05, "loss": 0.5926, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12923, "tokens_per_second_per_gpu": 9703.51, "total_tokens": 1275977218 }, { "epoch": 0.8079519879969993, "grad_norm": 0.8758507966995239, "learning_rate": 2e-05, "loss": 0.6213, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12924, "tokens_per_second_per_gpu": 10360.94, "total_tokens": 1276077484 }, { "epoch": 0.8080145036259064, "grad_norm": 0.8822025656700134, "learning_rate": 2e-05, "loss": 0.6269, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12925, "tokens_per_second_per_gpu": 10922.75, "total_tokens": 1276176770 }, { "epoch": 0.8080770192548137, "grad_norm": 0.8835980296134949, "learning_rate": 2e-05, "loss": 0.6208, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12926, "tokens_per_second_per_gpu": 10313.1, "total_tokens": 1276275656 }, { "epoch": 0.8081395348837209, "grad_norm": 0.9915001392364502, "learning_rate": 2e-05, "loss": 0.6361, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12927, "tokens_per_second_per_gpu": 11283.65, "total_tokens": 1276378165 }, { "epoch": 0.8082020505126282, "grad_norm": 0.8483503460884094, "learning_rate": 2e-05, "loss": 0.6025, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12928, "tokens_per_second_per_gpu": 10266.29, "total_tokens": 1276477855 }, { "epoch": 0.8082645661415354, "grad_norm": 0.892455518245697, "learning_rate": 2e-05, "loss": 0.6176, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12929, "tokens_per_second_per_gpu": 10682.56, "total_tokens": 1276581365 }, { "epoch": 0.8083270817704427, "grad_norm": 0.8888086080551147, "learning_rate": 2e-05, "loss": 0.6339, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12930, "tokens_per_second_per_gpu": 10192.99, "total_tokens": 1276678722 }, { "epoch": 0.8083895973993498, "grad_norm": 0.8937942385673523, "learning_rate": 2e-05, "loss": 0.6381, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12931, "tokens_per_second_per_gpu": 9909.73, "total_tokens": 1276778191 }, { "epoch": 0.808452113028257, "grad_norm": 1.0259143114089966, "learning_rate": 2e-05, "loss": 0.6253, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12932, "tokens_per_second_per_gpu": 10210.87, "total_tokens": 1276876915 }, { "epoch": 0.8085146286571643, "grad_norm": 0.8818376660346985, "learning_rate": 2e-05, "loss": 0.6227, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12933, "tokens_per_second_per_gpu": 10049.2, "total_tokens": 1276974748 }, { "epoch": 0.8085771442860715, "grad_norm": 0.8955162167549133, "learning_rate": 2e-05, "loss": 0.6787, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12934, "tokens_per_second_per_gpu": 10408.29, "total_tokens": 1277077135 }, { "epoch": 0.8086396599149788, "grad_norm": 0.9235848784446716, "learning_rate": 2e-05, "loss": 0.6273, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12935, "tokens_per_second_per_gpu": 10943.95, "total_tokens": 1277181363 }, { "epoch": 0.808702175543886, "grad_norm": 0.8992481827735901, "learning_rate": 2e-05, "loss": 0.6218, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12936, "tokens_per_second_per_gpu": 10721.0, "total_tokens": 1277281972 }, { "epoch": 0.8087646911727931, "grad_norm": 0.8609777688980103, "learning_rate": 2e-05, "loss": 0.579, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12937, "tokens_per_second_per_gpu": 10873.24, "total_tokens": 1277380924 }, { "epoch": 0.8088272068017004, "grad_norm": 0.9022150635719299, "learning_rate": 2e-05, "loss": 0.6561, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12938, "tokens_per_second_per_gpu": 10503.14, "total_tokens": 1277479747 }, { "epoch": 0.8088897224306076, "grad_norm": 0.9015203714370728, "learning_rate": 2e-05, "loss": 0.5893, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12939, "tokens_per_second_per_gpu": 10724.99, "total_tokens": 1277578916 }, { "epoch": 0.8089522380595149, "grad_norm": 0.9203551411628723, "learning_rate": 2e-05, "loss": 0.6351, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12940, "tokens_per_second_per_gpu": 10624.3, "total_tokens": 1277678411 }, { "epoch": 0.8090147536884221, "grad_norm": 0.8874899744987488, "learning_rate": 2e-05, "loss": 0.6299, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12941, "tokens_per_second_per_gpu": 10784.33, "total_tokens": 1277780232 }, { "epoch": 0.8090772693173294, "grad_norm": 0.9705468416213989, "learning_rate": 2e-05, "loss": 0.6247, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12942, "tokens_per_second_per_gpu": 11052.26, "total_tokens": 1277881801 }, { "epoch": 0.8091397849462365, "grad_norm": 0.9426804184913635, "learning_rate": 2e-05, "loss": 0.6206, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12943, "tokens_per_second_per_gpu": 9809.18, "total_tokens": 1277975659 }, { "epoch": 0.8092023005751438, "grad_norm": 0.8776766061782837, "learning_rate": 2e-05, "loss": 0.6462, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12944, "tokens_per_second_per_gpu": 11406.83, "total_tokens": 1278077908 }, { "epoch": 0.809264816204051, "grad_norm": 0.8973905444145203, "learning_rate": 2e-05, "loss": 0.6409, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12945, "tokens_per_second_per_gpu": 10530.8, "total_tokens": 1278177341 }, { "epoch": 0.8093273318329582, "grad_norm": 0.8462704420089722, "learning_rate": 2e-05, "loss": 0.6172, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12946, "tokens_per_second_per_gpu": 10867.62, "total_tokens": 1278281845 }, { "epoch": 0.8093898474618655, "grad_norm": 0.8761277794837952, "learning_rate": 2e-05, "loss": 0.6232, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12947, "tokens_per_second_per_gpu": 10087.82, "total_tokens": 1278382137 }, { "epoch": 0.8094523630907727, "grad_norm": 0.8695934414863586, "learning_rate": 2e-05, "loss": 0.5931, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12948, "tokens_per_second_per_gpu": 10729.47, "total_tokens": 1278482756 }, { "epoch": 0.8095148787196799, "grad_norm": 0.9206820726394653, "learning_rate": 2e-05, "loss": 0.6239, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12949, "tokens_per_second_per_gpu": 10861.65, "total_tokens": 1278582543 }, { "epoch": 0.8095773943485871, "grad_norm": 0.8518071174621582, "learning_rate": 2e-05, "loss": 0.5969, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12950, "tokens_per_second_per_gpu": 11192.31, "total_tokens": 1278684649 }, { "epoch": 0.8096399099774944, "grad_norm": 0.8958786129951477, "learning_rate": 2e-05, "loss": 0.6119, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12951, "tokens_per_second_per_gpu": 10946.66, "total_tokens": 1278783726 }, { "epoch": 0.8097024256064016, "grad_norm": 0.9611660242080688, "learning_rate": 2e-05, "loss": 0.6149, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12952, "tokens_per_second_per_gpu": 10610.82, "total_tokens": 1278882302 }, { "epoch": 0.8097649412353088, "grad_norm": 0.8875206708908081, "learning_rate": 2e-05, "loss": 0.6383, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12953, "tokens_per_second_per_gpu": 10675.17, "total_tokens": 1278983786 }, { "epoch": 0.8098274568642161, "grad_norm": 0.8883710503578186, "learning_rate": 2e-05, "loss": 0.6376, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12954, "tokens_per_second_per_gpu": 10432.11, "total_tokens": 1279084788 }, { "epoch": 0.8098899724931233, "grad_norm": 0.8806880712509155, "learning_rate": 2e-05, "loss": 0.577, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12955, "tokens_per_second_per_gpu": 9904.43, "total_tokens": 1279180320 }, { "epoch": 0.8099524881220305, "grad_norm": 0.9224118590354919, "learning_rate": 2e-05, "loss": 0.6345, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12956, "tokens_per_second_per_gpu": 10305.55, "total_tokens": 1279277174 }, { "epoch": 0.8100150037509377, "grad_norm": 0.8790350556373596, "learning_rate": 2e-05, "loss": 0.6154, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12957, "tokens_per_second_per_gpu": 10993.94, "total_tokens": 1279378967 }, { "epoch": 0.810077519379845, "grad_norm": 0.9004177451133728, "learning_rate": 2e-05, "loss": 0.6315, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12958, "tokens_per_second_per_gpu": 10727.97, "total_tokens": 1279474931 }, { "epoch": 0.8101400350087522, "grad_norm": 0.899582028388977, "learning_rate": 2e-05, "loss": 0.6327, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12959, "tokens_per_second_per_gpu": 11272.51, "total_tokens": 1279575871 }, { "epoch": 0.8102025506376594, "grad_norm": 0.9184314012527466, "learning_rate": 2e-05, "loss": 0.6222, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12960, "tokens_per_second_per_gpu": 10408.78, "total_tokens": 1279675884 }, { "epoch": 0.8102650662665667, "grad_norm": 0.8823256492614746, "learning_rate": 2e-05, "loss": 0.6618, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12961, "tokens_per_second_per_gpu": 11454.92, "total_tokens": 1279778776 }, { "epoch": 0.8103275818954738, "grad_norm": 0.899435818195343, "learning_rate": 2e-05, "loss": 0.6709, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12962, "tokens_per_second_per_gpu": 10526.66, "total_tokens": 1279878595 }, { "epoch": 0.8103900975243811, "grad_norm": 0.903482973575592, "learning_rate": 2e-05, "loss": 0.619, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12963, "tokens_per_second_per_gpu": 10016.39, "total_tokens": 1279976975 }, { "epoch": 0.8104526131532883, "grad_norm": 0.8661934733390808, "learning_rate": 2e-05, "loss": 0.6225, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12964, "tokens_per_second_per_gpu": 10698.4, "total_tokens": 1280078255 }, { "epoch": 0.8105151287821956, "grad_norm": 0.8801143169403076, "learning_rate": 2e-05, "loss": 0.6565, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12965, "tokens_per_second_per_gpu": 11255.14, "total_tokens": 1280185557 }, { "epoch": 0.8105776444111028, "grad_norm": 0.8909108638763428, "learning_rate": 2e-05, "loss": 0.6253, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12966, "tokens_per_second_per_gpu": 10442.08, "total_tokens": 1280287109 }, { "epoch": 0.81064016004001, "grad_norm": 0.8704034090042114, "learning_rate": 2e-05, "loss": 0.5925, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12967, "tokens_per_second_per_gpu": 10044.17, "total_tokens": 1280383778 }, { "epoch": 0.8107026756689172, "grad_norm": 0.8616259098052979, "learning_rate": 2e-05, "loss": 0.6385, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12968, "tokens_per_second_per_gpu": 10031.74, "total_tokens": 1280487226 }, { "epoch": 0.8107651912978244, "grad_norm": 0.8952956199645996, "learning_rate": 2e-05, "loss": 0.6067, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12969, "tokens_per_second_per_gpu": 10638.09, "total_tokens": 1280588171 }, { "epoch": 0.8108277069267317, "grad_norm": 0.8810846209526062, "learning_rate": 2e-05, "loss": 0.6346, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12970, "tokens_per_second_per_gpu": 10274.83, "total_tokens": 1280688175 }, { "epoch": 0.8108902225556389, "grad_norm": 0.8909710049629211, "learning_rate": 2e-05, "loss": 0.5906, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12971, "tokens_per_second_per_gpu": 10071.4, "total_tokens": 1280783996 }, { "epoch": 0.8109527381845462, "grad_norm": 0.938183069229126, "learning_rate": 2e-05, "loss": 0.589, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12972, "tokens_per_second_per_gpu": 10232.56, "total_tokens": 1280877286 }, { "epoch": 0.8110152538134534, "grad_norm": 0.9231818914413452, "learning_rate": 2e-05, "loss": 0.6178, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12973, "tokens_per_second_per_gpu": 10238.83, "total_tokens": 1280973615 }, { "epoch": 0.8110777694423605, "grad_norm": 0.9109702110290527, "learning_rate": 2e-05, "loss": 0.6097, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12974, "tokens_per_second_per_gpu": 10786.63, "total_tokens": 1281072998 }, { "epoch": 0.8111402850712678, "grad_norm": 0.8718274831771851, "learning_rate": 2e-05, "loss": 0.6003, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12975, "tokens_per_second_per_gpu": 10916.57, "total_tokens": 1281171249 }, { "epoch": 0.811202800700175, "grad_norm": 0.8776293396949768, "learning_rate": 2e-05, "loss": 0.5996, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12976, "tokens_per_second_per_gpu": 10814.09, "total_tokens": 1281271954 }, { "epoch": 0.8112653163290823, "grad_norm": 0.9145053625106812, "learning_rate": 2e-05, "loss": 0.6229, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12977, "tokens_per_second_per_gpu": 10964.19, "total_tokens": 1281370165 }, { "epoch": 0.8113278319579895, "grad_norm": 0.8797646164894104, "learning_rate": 2e-05, "loss": 0.6265, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12978, "tokens_per_second_per_gpu": 9996.43, "total_tokens": 1281467504 }, { "epoch": 0.8113903475868968, "grad_norm": 0.9005329608917236, "learning_rate": 2e-05, "loss": 0.6315, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12979, "tokens_per_second_per_gpu": 10727.49, "total_tokens": 1281567226 }, { "epoch": 0.8114528632158039, "grad_norm": 0.9138645529747009, "learning_rate": 2e-05, "loss": 0.6413, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12980, "tokens_per_second_per_gpu": 10593.86, "total_tokens": 1281669041 }, { "epoch": 0.8115153788447111, "grad_norm": 0.8812393546104431, "learning_rate": 2e-05, "loss": 0.6282, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12981, "tokens_per_second_per_gpu": 11113.41, "total_tokens": 1281774035 }, { "epoch": 0.8115778944736184, "grad_norm": 0.8760801553726196, "learning_rate": 2e-05, "loss": 0.5985, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12982, "tokens_per_second_per_gpu": 11469.56, "total_tokens": 1281876687 }, { "epoch": 0.8116404101025256, "grad_norm": 0.8392961621284485, "learning_rate": 2e-05, "loss": 0.6183, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12983, "tokens_per_second_per_gpu": 11418.22, "total_tokens": 1281980968 }, { "epoch": 0.8117029257314329, "grad_norm": 0.9027735590934753, "learning_rate": 2e-05, "loss": 0.5694, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12984, "tokens_per_second_per_gpu": 10026.45, "total_tokens": 1282073355 }, { "epoch": 0.8117654413603401, "grad_norm": 0.8916245698928833, "learning_rate": 2e-05, "loss": 0.6584, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12985, "tokens_per_second_per_gpu": 10896.67, "total_tokens": 1282177874 }, { "epoch": 0.8118279569892473, "grad_norm": 0.8702120780944824, "learning_rate": 2e-05, "loss": 0.6287, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12986, "tokens_per_second_per_gpu": 10206.39, "total_tokens": 1282275971 }, { "epoch": 0.8118904726181545, "grad_norm": 0.8958612680435181, "learning_rate": 2e-05, "loss": 0.6289, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12987, "tokens_per_second_per_gpu": 11146.7, "total_tokens": 1282377958 }, { "epoch": 0.8119529882470617, "grad_norm": 0.9027810096740723, "learning_rate": 2e-05, "loss": 0.5839, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12988, "tokens_per_second_per_gpu": 10243.07, "total_tokens": 1282473580 }, { "epoch": 0.812015503875969, "grad_norm": 0.9168592691421509, "learning_rate": 2e-05, "loss": 0.6352, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12989, "tokens_per_second_per_gpu": 11138.34, "total_tokens": 1282570538 }, { "epoch": 0.8120780195048762, "grad_norm": 0.881575882434845, "learning_rate": 2e-05, "loss": 0.6057, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12990, "tokens_per_second_per_gpu": 9963.25, "total_tokens": 1282668203 }, { "epoch": 0.8121405351337835, "grad_norm": 0.867139995098114, "learning_rate": 2e-05, "loss": 0.6407, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12991, "tokens_per_second_per_gpu": 10740.14, "total_tokens": 1282769541 }, { "epoch": 0.8122030507626907, "grad_norm": 0.868996262550354, "learning_rate": 2e-05, "loss": 0.557, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12992, "tokens_per_second_per_gpu": 10598.95, "total_tokens": 1282864547 }, { "epoch": 0.8122655663915979, "grad_norm": 0.8939544558525085, "learning_rate": 2e-05, "loss": 0.5917, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12993, "tokens_per_second_per_gpu": 10581.4, "total_tokens": 1282965259 }, { "epoch": 0.8123280820205051, "grad_norm": 0.8456668257713318, "learning_rate": 2e-05, "loss": 0.6207, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12994, "tokens_per_second_per_gpu": 10617.68, "total_tokens": 1283068234 }, { "epoch": 0.8123905976494123, "grad_norm": 0.8956769704818726, "learning_rate": 2e-05, "loss": 0.6333, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12995, "tokens_per_second_per_gpu": 10460.16, "total_tokens": 1283168993 }, { "epoch": 0.8124531132783196, "grad_norm": 0.8930708765983582, "learning_rate": 2e-05, "loss": 0.6629, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12996, "tokens_per_second_per_gpu": 10967.61, "total_tokens": 1283271246 }, { "epoch": 0.8125156289072268, "grad_norm": 0.8570277094841003, "learning_rate": 2e-05, "loss": 0.5974, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12997, "tokens_per_second_per_gpu": 11347.22, "total_tokens": 1283372052 }, { "epoch": 0.8125781445361341, "grad_norm": 0.8809077143669128, "learning_rate": 2e-05, "loss": 0.6378, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12998, "tokens_per_second_per_gpu": 10338.17, "total_tokens": 1283469745 }, { "epoch": 0.8126406601650412, "grad_norm": 0.9016430974006653, "learning_rate": 2e-05, "loss": 0.6539, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 12999, "tokens_per_second_per_gpu": 10725.13, "total_tokens": 1283570150 }, { "epoch": 0.8127031757939485, "grad_norm": 0.9228811860084534, "learning_rate": 2e-05, "loss": 0.5958, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13000, "tokens_per_second_per_gpu": 10732.24, "total_tokens": 1283668069 }, { "epoch": 0.8127656914228557, "grad_norm": 0.891916811466217, "learning_rate": 2e-05, "loss": 0.6027, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13001, "tokens_per_second_per_gpu": 9952.26, "total_tokens": 1283765331 }, { "epoch": 0.812828207051763, "grad_norm": 0.8794962763786316, "learning_rate": 2e-05, "loss": 0.6371, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13002, "tokens_per_second_per_gpu": 9882.62, "total_tokens": 1283863944 }, { "epoch": 0.8128907226806702, "grad_norm": 0.8890083432197571, "learning_rate": 2e-05, "loss": 0.604, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13003, "tokens_per_second_per_gpu": 10750.76, "total_tokens": 1283964424 }, { "epoch": 0.8129532383095774, "grad_norm": 0.9685178399085999, "learning_rate": 2e-05, "loss": 0.6595, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13004, "tokens_per_second_per_gpu": 9770.04, "total_tokens": 1284057900 }, { "epoch": 0.8130157539384846, "grad_norm": 0.8886628746986389, "learning_rate": 2e-05, "loss": 0.607, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13005, "tokens_per_second_per_gpu": 10201.73, "total_tokens": 1284153379 }, { "epoch": 0.8130782695673918, "grad_norm": 0.8812659382820129, "learning_rate": 2e-05, "loss": 0.6193, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13006, "tokens_per_second_per_gpu": 11210.11, "total_tokens": 1284253030 }, { "epoch": 0.8131407851962991, "grad_norm": 0.8896257877349854, "learning_rate": 2e-05, "loss": 0.576, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13007, "tokens_per_second_per_gpu": 10578.6, "total_tokens": 1284351416 }, { "epoch": 0.8132033008252063, "grad_norm": 0.9276590943336487, "learning_rate": 2e-05, "loss": 0.6296, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13008, "tokens_per_second_per_gpu": 10334.82, "total_tokens": 1284449922 }, { "epoch": 0.8132658164541136, "grad_norm": 0.9133450984954834, "learning_rate": 2e-05, "loss": 0.5843, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13009, "tokens_per_second_per_gpu": 9703.95, "total_tokens": 1284544386 }, { "epoch": 0.8133283320830208, "grad_norm": 0.9008229970932007, "learning_rate": 2e-05, "loss": 0.5851, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13010, "tokens_per_second_per_gpu": 10762.39, "total_tokens": 1284641540 }, { "epoch": 0.8133908477119279, "grad_norm": 0.8634399771690369, "learning_rate": 2e-05, "loss": 0.595, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13011, "tokens_per_second_per_gpu": 10566.29, "total_tokens": 1284739253 }, { "epoch": 0.8134533633408352, "grad_norm": 0.8451958894729614, "learning_rate": 2e-05, "loss": 0.6135, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13012, "tokens_per_second_per_gpu": 11055.29, "total_tokens": 1284844081 }, { "epoch": 0.8135158789697424, "grad_norm": 0.8762065172195435, "learning_rate": 2e-05, "loss": 0.6079, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13013, "tokens_per_second_per_gpu": 9765.92, "total_tokens": 1284939178 }, { "epoch": 0.8135783945986497, "grad_norm": 0.8876738548278809, "learning_rate": 2e-05, "loss": 0.6093, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13014, "tokens_per_second_per_gpu": 10724.89, "total_tokens": 1285038572 }, { "epoch": 0.8136409102275569, "grad_norm": 0.9462924599647522, "learning_rate": 2e-05, "loss": 0.6023, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13015, "tokens_per_second_per_gpu": 10501.22, "total_tokens": 1285137014 }, { "epoch": 0.8137034258564642, "grad_norm": 0.8571653962135315, "learning_rate": 2e-05, "loss": 0.6174, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13016, "tokens_per_second_per_gpu": 10948.07, "total_tokens": 1285240287 }, { "epoch": 0.8137659414853713, "grad_norm": 0.8892654776573181, "learning_rate": 2e-05, "loss": 0.6015, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13017, "tokens_per_second_per_gpu": 10054.99, "total_tokens": 1285337790 }, { "epoch": 0.8138284571142785, "grad_norm": 0.9151768684387207, "learning_rate": 2e-05, "loss": 0.6736, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13018, "tokens_per_second_per_gpu": 10210.3, "total_tokens": 1285437383 }, { "epoch": 0.8138909727431858, "grad_norm": 0.8883031010627747, "learning_rate": 2e-05, "loss": 0.6119, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13019, "tokens_per_second_per_gpu": 10779.47, "total_tokens": 1285537292 }, { "epoch": 0.813953488372093, "grad_norm": 0.9050011038780212, "learning_rate": 2e-05, "loss": 0.6482, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13020, "tokens_per_second_per_gpu": 10756.02, "total_tokens": 1285637908 }, { "epoch": 0.8140160040010003, "grad_norm": 0.9065563082695007, "learning_rate": 2e-05, "loss": 0.6282, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13021, "tokens_per_second_per_gpu": 10772.77, "total_tokens": 1285738894 }, { "epoch": 0.8140785196299075, "grad_norm": 0.9159702062606812, "learning_rate": 2e-05, "loss": 0.6212, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13022, "tokens_per_second_per_gpu": 10400.27, "total_tokens": 1285838923 }, { "epoch": 0.8141410352588146, "grad_norm": 0.9130274653434753, "learning_rate": 2e-05, "loss": 0.6628, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13023, "tokens_per_second_per_gpu": 10888.74, "total_tokens": 1285942369 }, { "epoch": 0.8142035508877219, "grad_norm": 0.8753762245178223, "learning_rate": 2e-05, "loss": 0.5927, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13024, "tokens_per_second_per_gpu": 10587.69, "total_tokens": 1286041824 }, { "epoch": 0.8142660665166291, "grad_norm": 0.9001593589782715, "learning_rate": 2e-05, "loss": 0.5916, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13025, "tokens_per_second_per_gpu": 10463.25, "total_tokens": 1286136301 }, { "epoch": 0.8143285821455364, "grad_norm": 0.9030860662460327, "learning_rate": 2e-05, "loss": 0.669, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13026, "tokens_per_second_per_gpu": 11273.87, "total_tokens": 1286238164 }, { "epoch": 0.8143910977744436, "grad_norm": 0.900233268737793, "learning_rate": 2e-05, "loss": 0.6348, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13027, "tokens_per_second_per_gpu": 10888.14, "total_tokens": 1286338394 }, { "epoch": 0.8144536134033509, "grad_norm": 0.9287874102592468, "learning_rate": 2e-05, "loss": 0.6254, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13028, "tokens_per_second_per_gpu": 10361.57, "total_tokens": 1286438707 }, { "epoch": 0.8145161290322581, "grad_norm": 0.8703992962837219, "learning_rate": 2e-05, "loss": 0.5772, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13029, "tokens_per_second_per_gpu": 9748.05, "total_tokens": 1286533679 }, { "epoch": 0.8145786446611653, "grad_norm": 0.9029073715209961, "learning_rate": 2e-05, "loss": 0.6249, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13030, "tokens_per_second_per_gpu": 10691.69, "total_tokens": 1286631479 }, { "epoch": 0.8146411602900725, "grad_norm": 0.8751430511474609, "learning_rate": 2e-05, "loss": 0.5538, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13031, "tokens_per_second_per_gpu": 10150.01, "total_tokens": 1286725854 }, { "epoch": 0.8147036759189797, "grad_norm": 0.8684191703796387, "learning_rate": 2e-05, "loss": 0.6308, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13032, "tokens_per_second_per_gpu": 10506.48, "total_tokens": 1286829784 }, { "epoch": 0.814766191547887, "grad_norm": 0.834115207195282, "learning_rate": 2e-05, "loss": 0.5905, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13033, "tokens_per_second_per_gpu": 11081.14, "total_tokens": 1286932220 }, { "epoch": 0.8148287071767942, "grad_norm": 0.8652623295783997, "learning_rate": 2e-05, "loss": 0.6188, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13034, "tokens_per_second_per_gpu": 11300.61, "total_tokens": 1287038469 }, { "epoch": 0.8148912228057015, "grad_norm": 0.9068727493286133, "learning_rate": 2e-05, "loss": 0.6522, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13035, "tokens_per_second_per_gpu": 11177.53, "total_tokens": 1287141545 }, { "epoch": 0.8149537384346086, "grad_norm": 0.921515941619873, "learning_rate": 2e-05, "loss": 0.6341, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13036, "tokens_per_second_per_gpu": 11560.09, "total_tokens": 1287245622 }, { "epoch": 0.8150162540635159, "grad_norm": 0.8815343379974365, "learning_rate": 2e-05, "loss": 0.6196, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13037, "tokens_per_second_per_gpu": 10718.4, "total_tokens": 1287343123 }, { "epoch": 0.8150787696924231, "grad_norm": 0.8720484375953674, "learning_rate": 2e-05, "loss": 0.6241, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13038, "tokens_per_second_per_gpu": 10770.77, "total_tokens": 1287440510 }, { "epoch": 0.8151412853213303, "grad_norm": 0.9079184532165527, "learning_rate": 2e-05, "loss": 0.6316, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13039, "tokens_per_second_per_gpu": 10101.48, "total_tokens": 1287537443 }, { "epoch": 0.8152038009502376, "grad_norm": 0.8771799802780151, "learning_rate": 2e-05, "loss": 0.6094, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13040, "tokens_per_second_per_gpu": 10662.14, "total_tokens": 1287634746 }, { "epoch": 0.8152663165791448, "grad_norm": 0.9077622890472412, "learning_rate": 2e-05, "loss": 0.6063, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13041, "tokens_per_second_per_gpu": 9799.5, "total_tokens": 1287730451 }, { "epoch": 0.815328832208052, "grad_norm": 0.9032795429229736, "learning_rate": 2e-05, "loss": 0.6697, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13042, "tokens_per_second_per_gpu": 11028.8, "total_tokens": 1287831635 }, { "epoch": 0.8153913478369592, "grad_norm": 0.896435022354126, "learning_rate": 2e-05, "loss": 0.6175, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13043, "tokens_per_second_per_gpu": 9847.69, "total_tokens": 1287931526 }, { "epoch": 0.8154538634658665, "grad_norm": 0.8490618467330933, "learning_rate": 2e-05, "loss": 0.5977, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13044, "tokens_per_second_per_gpu": 10652.81, "total_tokens": 1288030708 }, { "epoch": 0.8155163790947737, "grad_norm": 0.9037817120552063, "learning_rate": 2e-05, "loss": 0.6302, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13045, "tokens_per_second_per_gpu": 10454.79, "total_tokens": 1288129049 }, { "epoch": 0.815578894723681, "grad_norm": 0.8694532513618469, "learning_rate": 2e-05, "loss": 0.626, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13046, "tokens_per_second_per_gpu": 10359.44, "total_tokens": 1288229188 }, { "epoch": 0.8156414103525882, "grad_norm": 0.9287206530570984, "learning_rate": 2e-05, "loss": 0.6467, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13047, "tokens_per_second_per_gpu": 10858.99, "total_tokens": 1288325518 }, { "epoch": 0.8157039259814953, "grad_norm": 0.8715130090713501, "learning_rate": 2e-05, "loss": 0.6182, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13048, "tokens_per_second_per_gpu": 10717.33, "total_tokens": 1288425974 }, { "epoch": 0.8157664416104026, "grad_norm": 0.8872639536857605, "learning_rate": 2e-05, "loss": 0.5978, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13049, "tokens_per_second_per_gpu": 9820.71, "total_tokens": 1288520667 }, { "epoch": 0.8158289572393098, "grad_norm": 0.8963018655776978, "learning_rate": 2e-05, "loss": 0.6469, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13050, "tokens_per_second_per_gpu": 11257.93, "total_tokens": 1288617820 }, { "epoch": 0.8158914728682171, "grad_norm": 0.8765252232551575, "learning_rate": 2e-05, "loss": 0.6305, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13051, "tokens_per_second_per_gpu": 10752.83, "total_tokens": 1288721121 }, { "epoch": 0.8159539884971243, "grad_norm": 0.881539523601532, "learning_rate": 2e-05, "loss": 0.6239, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13052, "tokens_per_second_per_gpu": 10440.11, "total_tokens": 1288821495 }, { "epoch": 0.8160165041260315, "grad_norm": 0.8652839064598083, "learning_rate": 2e-05, "loss": 0.6066, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13053, "tokens_per_second_per_gpu": 10435.61, "total_tokens": 1288922378 }, { "epoch": 0.8160790197549387, "grad_norm": 0.8469005227088928, "learning_rate": 2e-05, "loss": 0.5989, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13054, "tokens_per_second_per_gpu": 11023.19, "total_tokens": 1289023818 }, { "epoch": 0.8161415353838459, "grad_norm": 0.8502302169799805, "learning_rate": 2e-05, "loss": 0.5851, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13055, "tokens_per_second_per_gpu": 10484.59, "total_tokens": 1289122803 }, { "epoch": 0.8162040510127532, "grad_norm": 0.9387757778167725, "learning_rate": 2e-05, "loss": 0.6691, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13056, "tokens_per_second_per_gpu": 10745.34, "total_tokens": 1289219277 }, { "epoch": 0.8162665666416604, "grad_norm": 0.8839127421379089, "learning_rate": 2e-05, "loss": 0.5605, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13057, "tokens_per_second_per_gpu": 10003.54, "total_tokens": 1289312870 }, { "epoch": 0.8163290822705677, "grad_norm": 0.865004301071167, "learning_rate": 2e-05, "loss": 0.6263, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13058, "tokens_per_second_per_gpu": 10908.48, "total_tokens": 1289414320 }, { "epoch": 0.8163915978994749, "grad_norm": 0.868436872959137, "learning_rate": 2e-05, "loss": 0.646, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13059, "tokens_per_second_per_gpu": 10786.07, "total_tokens": 1289513529 }, { "epoch": 0.816454113528382, "grad_norm": 0.8518000841140747, "learning_rate": 2e-05, "loss": 0.6238, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13060, "tokens_per_second_per_gpu": 11070.41, "total_tokens": 1289615704 }, { "epoch": 0.8165166291572893, "grad_norm": 0.9019725322723389, "learning_rate": 2e-05, "loss": 0.6559, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13061, "tokens_per_second_per_gpu": 10254.35, "total_tokens": 1289712675 }, { "epoch": 0.8165791447861965, "grad_norm": 0.8564127683639526, "learning_rate": 2e-05, "loss": 0.5955, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13062, "tokens_per_second_per_gpu": 10832.99, "total_tokens": 1289813818 }, { "epoch": 0.8166416604151038, "grad_norm": 0.8709256649017334, "learning_rate": 2e-05, "loss": 0.5821, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13063, "tokens_per_second_per_gpu": 10822.75, "total_tokens": 1289913687 }, { "epoch": 0.816704176044011, "grad_norm": 0.9381892085075378, "learning_rate": 2e-05, "loss": 0.6143, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13064, "tokens_per_second_per_gpu": 10674.31, "total_tokens": 1290011920 }, { "epoch": 0.8167666916729183, "grad_norm": 0.8882848620414734, "learning_rate": 2e-05, "loss": 0.607, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13065, "tokens_per_second_per_gpu": 10954.47, "total_tokens": 1290112989 }, { "epoch": 0.8168292073018255, "grad_norm": 0.9220638275146484, "learning_rate": 2e-05, "loss": 0.6481, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13066, "tokens_per_second_per_gpu": 10411.26, "total_tokens": 1290210999 }, { "epoch": 0.8168917229307326, "grad_norm": 0.8799504041671753, "learning_rate": 2e-05, "loss": 0.5966, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13067, "tokens_per_second_per_gpu": 10699.91, "total_tokens": 1290309307 }, { "epoch": 0.8169542385596399, "grad_norm": 0.8776509165763855, "learning_rate": 2e-05, "loss": 0.6094, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13068, "tokens_per_second_per_gpu": 10438.59, "total_tokens": 1290406576 }, { "epoch": 0.8170167541885471, "grad_norm": 0.9017881155014038, "learning_rate": 2e-05, "loss": 0.6354, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13069, "tokens_per_second_per_gpu": 10563.36, "total_tokens": 1290506243 }, { "epoch": 0.8170792698174544, "grad_norm": 0.8828062415122986, "learning_rate": 2e-05, "loss": 0.6162, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13070, "tokens_per_second_per_gpu": 17242.72, "total_tokens": 1290604084 }, { "epoch": 0.8171417854463616, "grad_norm": 0.8934770822525024, "learning_rate": 2e-05, "loss": 0.6103, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13071, "tokens_per_second_per_gpu": 17715.65, "total_tokens": 1290705883 }, { "epoch": 0.8172043010752689, "grad_norm": 0.9384058713912964, "learning_rate": 2e-05, "loss": 0.6775, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13072, "tokens_per_second_per_gpu": 17715.42, "total_tokens": 1290807413 }, { "epoch": 0.817266816704176, "grad_norm": 0.8868180513381958, "learning_rate": 2e-05, "loss": 0.6741, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13073, "tokens_per_second_per_gpu": 17932.77, "total_tokens": 1290910427 }, { "epoch": 0.8173293323330832, "grad_norm": 0.8765392899513245, "learning_rate": 2e-05, "loss": 0.6197, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13074, "tokens_per_second_per_gpu": 17690.17, "total_tokens": 1291009221 }, { "epoch": 0.8173918479619905, "grad_norm": 0.8808318376541138, "learning_rate": 2e-05, "loss": 0.631, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13075, "tokens_per_second_per_gpu": 17176.74, "total_tokens": 1291108992 }, { "epoch": 0.8174543635908977, "grad_norm": 0.8969024419784546, "learning_rate": 2e-05, "loss": 0.6115, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13076, "tokens_per_second_per_gpu": 17504.62, "total_tokens": 1291208206 }, { "epoch": 0.817516879219805, "grad_norm": 0.9171181321144104, "learning_rate": 2e-05, "loss": 0.6301, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13077, "tokens_per_second_per_gpu": 17078.93, "total_tokens": 1291307405 }, { "epoch": 0.8175793948487122, "grad_norm": 0.8589670062065125, "learning_rate": 2e-05, "loss": 0.6062, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13078, "tokens_per_second_per_gpu": 17153.0, "total_tokens": 1291407906 }, { "epoch": 0.8176419104776194, "grad_norm": 0.8695279955863953, "learning_rate": 2e-05, "loss": 0.6448, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13079, "tokens_per_second_per_gpu": 18166.95, "total_tokens": 1291508616 }, { "epoch": 0.8177044261065266, "grad_norm": 0.8734322786331177, "learning_rate": 2e-05, "loss": 0.6444, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13080, "tokens_per_second_per_gpu": 17264.78, "total_tokens": 1291607400 }, { "epoch": 0.8177669417354338, "grad_norm": 0.9643712639808655, "learning_rate": 2e-05, "loss": 0.5754, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13081, "tokens_per_second_per_gpu": 13848.04, "total_tokens": 1291704456 }, { "epoch": 0.8178294573643411, "grad_norm": 0.858277440071106, "learning_rate": 2e-05, "loss": 0.5779, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13082, "tokens_per_second_per_gpu": 15230.18, "total_tokens": 1291804085 }, { "epoch": 0.8178919729932483, "grad_norm": 0.9587333798408508, "learning_rate": 2e-05, "loss": 0.5999, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13083, "tokens_per_second_per_gpu": 16633.45, "total_tokens": 1291902230 }, { "epoch": 0.8179544886221556, "grad_norm": 0.879860520362854, "learning_rate": 2e-05, "loss": 0.6323, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13084, "tokens_per_second_per_gpu": 17206.0, "total_tokens": 1292004697 }, { "epoch": 0.8180170042510627, "grad_norm": 0.9066539406776428, "learning_rate": 2e-05, "loss": 0.638, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13085, "tokens_per_second_per_gpu": 12226.46, "total_tokens": 1292103218 }, { "epoch": 0.81807951987997, "grad_norm": 0.8924917578697205, "learning_rate": 2e-05, "loss": 0.6231, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13086, "tokens_per_second_per_gpu": 9495.91, "total_tokens": 1292200239 }, { "epoch": 0.8181420355088772, "grad_norm": 0.8718509078025818, "learning_rate": 2e-05, "loss": 0.5773, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13087, "tokens_per_second_per_gpu": 10117.9, "total_tokens": 1292297101 }, { "epoch": 0.8182045511377845, "grad_norm": 0.8637275695800781, "learning_rate": 2e-05, "loss": 0.6258, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13088, "tokens_per_second_per_gpu": 10592.37, "total_tokens": 1292399153 }, { "epoch": 0.8182670667666917, "grad_norm": 0.8821967840194702, "learning_rate": 2e-05, "loss": 0.6195, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13089, "tokens_per_second_per_gpu": 11210.65, "total_tokens": 1292501355 }, { "epoch": 0.8183295823955989, "grad_norm": 0.9053105711936951, "learning_rate": 2e-05, "loss": 0.6502, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13090, "tokens_per_second_per_gpu": 10451.38, "total_tokens": 1292599235 }, { "epoch": 0.8183920980245061, "grad_norm": 0.8971763849258423, "learning_rate": 2e-05, "loss": 0.6439, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13091, "tokens_per_second_per_gpu": 10707.57, "total_tokens": 1292699825 }, { "epoch": 0.8184546136534133, "grad_norm": 0.8742074370384216, "learning_rate": 2e-05, "loss": 0.6371, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13092, "tokens_per_second_per_gpu": 10473.74, "total_tokens": 1292797545 }, { "epoch": 0.8185171292823206, "grad_norm": 0.8786153197288513, "learning_rate": 2e-05, "loss": 0.5954, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13093, "tokens_per_second_per_gpu": 9447.3, "total_tokens": 1292890780 }, { "epoch": 0.8185796449112278, "grad_norm": 0.8838868141174316, "learning_rate": 2e-05, "loss": 0.6229, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13094, "tokens_per_second_per_gpu": 9940.21, "total_tokens": 1292988989 }, { "epoch": 0.818642160540135, "grad_norm": 0.9177571535110474, "learning_rate": 2e-05, "loss": 0.6141, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13095, "tokens_per_second_per_gpu": 10917.96, "total_tokens": 1293086553 }, { "epoch": 0.8187046761690423, "grad_norm": 0.9012125730514526, "learning_rate": 2e-05, "loss": 0.6062, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13096, "tokens_per_second_per_gpu": 10583.29, "total_tokens": 1293187435 }, { "epoch": 0.8187671917979494, "grad_norm": 0.8591479659080505, "learning_rate": 2e-05, "loss": 0.6092, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13097, "tokens_per_second_per_gpu": 10913.71, "total_tokens": 1293288476 }, { "epoch": 0.8188297074268567, "grad_norm": 0.8733076453208923, "learning_rate": 2e-05, "loss": 0.5791, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13098, "tokens_per_second_per_gpu": 11099.26, "total_tokens": 1293385960 }, { "epoch": 0.8188922230557639, "grad_norm": 0.9217618703842163, "learning_rate": 2e-05, "loss": 0.6301, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13099, "tokens_per_second_per_gpu": 10878.21, "total_tokens": 1293481802 }, { "epoch": 0.8189547386846712, "grad_norm": 0.9282904863357544, "learning_rate": 2e-05, "loss": 0.5772, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13100, "tokens_per_second_per_gpu": 10273.22, "total_tokens": 1293568506 }, { "epoch": 0.8190172543135784, "grad_norm": 0.9075071811676025, "learning_rate": 2e-05, "loss": 0.6151, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13101, "tokens_per_second_per_gpu": 10312.26, "total_tokens": 1293664235 }, { "epoch": 0.8190797699424857, "grad_norm": 0.8560483455657959, "learning_rate": 2e-05, "loss": 0.6268, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13102, "tokens_per_second_per_gpu": 11597.18, "total_tokens": 1293771849 }, { "epoch": 0.8191422855713929, "grad_norm": 0.8791046142578125, "learning_rate": 2e-05, "loss": 0.6474, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13103, "tokens_per_second_per_gpu": 11609.51, "total_tokens": 1293876529 }, { "epoch": 0.8192048012003, "grad_norm": 0.9020381569862366, "learning_rate": 2e-05, "loss": 0.6067, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13104, "tokens_per_second_per_gpu": 11372.66, "total_tokens": 1293976436 }, { "epoch": 0.8192673168292073, "grad_norm": 0.8727266192436218, "learning_rate": 2e-05, "loss": 0.6031, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13105, "tokens_per_second_per_gpu": 11105.56, "total_tokens": 1294073510 }, { "epoch": 0.8193298324581145, "grad_norm": 0.8495321273803711, "learning_rate": 2e-05, "loss": 0.6183, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13106, "tokens_per_second_per_gpu": 11150.25, "total_tokens": 1294173869 }, { "epoch": 0.8193923480870218, "grad_norm": 0.8920471668243408, "learning_rate": 2e-05, "loss": 0.5962, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13107, "tokens_per_second_per_gpu": 9868.16, "total_tokens": 1294267545 }, { "epoch": 0.819454863715929, "grad_norm": 0.8616462349891663, "learning_rate": 2e-05, "loss": 0.6058, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13108, "tokens_per_second_per_gpu": 10714.88, "total_tokens": 1294366466 }, { "epoch": 0.8195173793448363, "grad_norm": 0.91157466173172, "learning_rate": 2e-05, "loss": 0.6303, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13109, "tokens_per_second_per_gpu": 10811.6, "total_tokens": 1294464600 }, { "epoch": 0.8195798949737434, "grad_norm": 0.872292697429657, "learning_rate": 2e-05, "loss": 0.6306, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13110, "tokens_per_second_per_gpu": 10678.29, "total_tokens": 1294562561 }, { "epoch": 0.8196424106026506, "grad_norm": 0.9254687428474426, "learning_rate": 2e-05, "loss": 0.6074, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13111, "tokens_per_second_per_gpu": 11408.78, "total_tokens": 1294661965 }, { "epoch": 0.8197049262315579, "grad_norm": 0.8687699437141418, "learning_rate": 2e-05, "loss": 0.6206, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13112, "tokens_per_second_per_gpu": 10285.87, "total_tokens": 1294761109 }, { "epoch": 0.8197674418604651, "grad_norm": 0.8796836137771606, "learning_rate": 2e-05, "loss": 0.6029, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13113, "tokens_per_second_per_gpu": 10170.42, "total_tokens": 1294860099 }, { "epoch": 0.8198299574893724, "grad_norm": 0.9068730473518372, "learning_rate": 2e-05, "loss": 0.6135, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13114, "tokens_per_second_per_gpu": 10514.4, "total_tokens": 1294954817 }, { "epoch": 0.8198924731182796, "grad_norm": 0.8852189779281616, "learning_rate": 2e-05, "loss": 0.5864, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13115, "tokens_per_second_per_gpu": 9427.68, "total_tokens": 1295046653 }, { "epoch": 0.8199549887471868, "grad_norm": 0.8831503987312317, "learning_rate": 2e-05, "loss": 0.6284, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13116, "tokens_per_second_per_gpu": 10844.78, "total_tokens": 1295149560 }, { "epoch": 0.820017504376094, "grad_norm": 0.868919849395752, "learning_rate": 2e-05, "loss": 0.6396, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13117, "tokens_per_second_per_gpu": 10562.81, "total_tokens": 1295249266 }, { "epoch": 0.8200800200050012, "grad_norm": 0.8846012353897095, "learning_rate": 2e-05, "loss": 0.6537, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13118, "tokens_per_second_per_gpu": 10390.02, "total_tokens": 1295346983 }, { "epoch": 0.8201425356339085, "grad_norm": 0.8788541555404663, "learning_rate": 2e-05, "loss": 0.6403, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13119, "tokens_per_second_per_gpu": 10886.03, "total_tokens": 1295449344 }, { "epoch": 0.8202050512628157, "grad_norm": 0.906425416469574, "learning_rate": 2e-05, "loss": 0.666, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13120, "tokens_per_second_per_gpu": 10465.92, "total_tokens": 1295548624 }, { "epoch": 0.820267566891723, "grad_norm": 0.892652690410614, "learning_rate": 2e-05, "loss": 0.6439, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13121, "tokens_per_second_per_gpu": 11250.45, "total_tokens": 1295648997 }, { "epoch": 0.8203300825206301, "grad_norm": 0.8915622234344482, "learning_rate": 2e-05, "loss": 0.5806, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13122, "tokens_per_second_per_gpu": 9639.62, "total_tokens": 1295738446 }, { "epoch": 0.8203925981495374, "grad_norm": 0.8868075013160706, "learning_rate": 2e-05, "loss": 0.6215, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13123, "tokens_per_second_per_gpu": 10231.79, "total_tokens": 1295836713 }, { "epoch": 0.8204551137784446, "grad_norm": 0.8679355382919312, "learning_rate": 2e-05, "loss": 0.6102, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13124, "tokens_per_second_per_gpu": 10776.5, "total_tokens": 1295937885 }, { "epoch": 0.8205176294073518, "grad_norm": 0.9354283809661865, "learning_rate": 2e-05, "loss": 0.5912, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13125, "tokens_per_second_per_gpu": 10832.32, "total_tokens": 1296036003 }, { "epoch": 0.8205801450362591, "grad_norm": 0.8609442710876465, "learning_rate": 2e-05, "loss": 0.5917, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13126, "tokens_per_second_per_gpu": 10545.48, "total_tokens": 1296135704 }, { "epoch": 0.8206426606651663, "grad_norm": 0.9004719257354736, "learning_rate": 2e-05, "loss": 0.566, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13127, "tokens_per_second_per_gpu": 9585.75, "total_tokens": 1296230276 }, { "epoch": 0.8207051762940735, "grad_norm": 0.94227534532547, "learning_rate": 2e-05, "loss": 0.6697, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13128, "tokens_per_second_per_gpu": 10461.78, "total_tokens": 1296328132 }, { "epoch": 0.8207676919229807, "grad_norm": 0.9011563658714294, "learning_rate": 2e-05, "loss": 0.62, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13129, "tokens_per_second_per_gpu": 10840.89, "total_tokens": 1296428240 }, { "epoch": 0.820830207551888, "grad_norm": 0.8720973134040833, "learning_rate": 2e-05, "loss": 0.5697, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13130, "tokens_per_second_per_gpu": 10806.02, "total_tokens": 1296525628 }, { "epoch": 0.8208927231807952, "grad_norm": 0.906324565410614, "learning_rate": 2e-05, "loss": 0.6333, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13131, "tokens_per_second_per_gpu": 10556.25, "total_tokens": 1296622987 }, { "epoch": 0.8209552388097024, "grad_norm": 0.8924976587295532, "learning_rate": 2e-05, "loss": 0.6152, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13132, "tokens_per_second_per_gpu": 10099.07, "total_tokens": 1296722819 }, { "epoch": 0.8210177544386097, "grad_norm": 0.8893013000488281, "learning_rate": 2e-05, "loss": 0.6395, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13133, "tokens_per_second_per_gpu": 10826.3, "total_tokens": 1296823917 }, { "epoch": 0.8210802700675168, "grad_norm": 0.9132018685340881, "learning_rate": 2e-05, "loss": 0.6391, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13134, "tokens_per_second_per_gpu": 10911.93, "total_tokens": 1296920055 }, { "epoch": 0.8211427856964241, "grad_norm": 0.9255030155181885, "learning_rate": 2e-05, "loss": 0.7098, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13135, "tokens_per_second_per_gpu": 11282.48, "total_tokens": 1297023943 }, { "epoch": 0.8212053013253313, "grad_norm": 0.9476365447044373, "learning_rate": 2e-05, "loss": 0.6681, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13136, "tokens_per_second_per_gpu": 9785.64, "total_tokens": 1297116490 }, { "epoch": 0.8212678169542386, "grad_norm": 0.8705200552940369, "learning_rate": 2e-05, "loss": 0.5908, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13137, "tokens_per_second_per_gpu": 10528.44, "total_tokens": 1297215564 }, { "epoch": 0.8213303325831458, "grad_norm": 0.8579442501068115, "learning_rate": 2e-05, "loss": 0.6154, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13138, "tokens_per_second_per_gpu": 11108.55, "total_tokens": 1297318471 }, { "epoch": 0.821392848212053, "grad_norm": 0.8712074160575867, "learning_rate": 2e-05, "loss": 0.6124, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13139, "tokens_per_second_per_gpu": 10222.69, "total_tokens": 1297417438 }, { "epoch": 0.8214553638409603, "grad_norm": 0.8836900591850281, "learning_rate": 2e-05, "loss": 0.6235, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13140, "tokens_per_second_per_gpu": 10870.75, "total_tokens": 1297517182 }, { "epoch": 0.8215178794698674, "grad_norm": 0.8438988327980042, "learning_rate": 2e-05, "loss": 0.5994, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13141, "tokens_per_second_per_gpu": 10804.82, "total_tokens": 1297617692 }, { "epoch": 0.8215803950987747, "grad_norm": 0.8955199718475342, "learning_rate": 2e-05, "loss": 0.6269, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13142, "tokens_per_second_per_gpu": 10303.96, "total_tokens": 1297717112 }, { "epoch": 0.8216429107276819, "grad_norm": 0.878690779209137, "learning_rate": 2e-05, "loss": 0.6243, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13143, "tokens_per_second_per_gpu": 10212.97, "total_tokens": 1297816755 }, { "epoch": 0.8217054263565892, "grad_norm": 0.8579127788543701, "learning_rate": 2e-05, "loss": 0.6048, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13144, "tokens_per_second_per_gpu": 10759.87, "total_tokens": 1297918203 }, { "epoch": 0.8217679419854964, "grad_norm": 0.9338237643241882, "learning_rate": 2e-05, "loss": 0.651, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13145, "tokens_per_second_per_gpu": 10753.79, "total_tokens": 1298015751 }, { "epoch": 0.8218304576144037, "grad_norm": 0.9324151873588562, "learning_rate": 2e-05, "loss": 0.6746, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13146, "tokens_per_second_per_gpu": 10178.53, "total_tokens": 1298118904 }, { "epoch": 0.8218929732433108, "grad_norm": 0.8790278434753418, "learning_rate": 2e-05, "loss": 0.617, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13147, "tokens_per_second_per_gpu": 10890.59, "total_tokens": 1298217443 }, { "epoch": 0.821955488872218, "grad_norm": 0.8949033617973328, "learning_rate": 2e-05, "loss": 0.6445, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13148, "tokens_per_second_per_gpu": 10969.14, "total_tokens": 1298320214 }, { "epoch": 0.8220180045011253, "grad_norm": 0.890540599822998, "learning_rate": 2e-05, "loss": 0.6646, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13149, "tokens_per_second_per_gpu": 10211.42, "total_tokens": 1298419712 }, { "epoch": 0.8220805201300325, "grad_norm": 0.8863677382469177, "learning_rate": 2e-05, "loss": 0.6308, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13150, "tokens_per_second_per_gpu": 9953.46, "total_tokens": 1298515110 }, { "epoch": 0.8221430357589398, "grad_norm": 0.8719918727874756, "learning_rate": 2e-05, "loss": 0.5842, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13151, "tokens_per_second_per_gpu": 10937.26, "total_tokens": 1298610322 }, { "epoch": 0.822205551387847, "grad_norm": 0.8847877979278564, "learning_rate": 2e-05, "loss": 0.6308, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13152, "tokens_per_second_per_gpu": 11138.83, "total_tokens": 1298708020 }, { "epoch": 0.8222680670167541, "grad_norm": 0.8915961980819702, "learning_rate": 2e-05, "loss": 0.6415, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13153, "tokens_per_second_per_gpu": 10560.01, "total_tokens": 1298809988 }, { "epoch": 0.8223305826456614, "grad_norm": 0.9183420538902283, "learning_rate": 2e-05, "loss": 0.6362, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13154, "tokens_per_second_per_gpu": 10943.39, "total_tokens": 1298910877 }, { "epoch": 0.8223930982745686, "grad_norm": 0.8887003660202026, "learning_rate": 2e-05, "loss": 0.6543, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13155, "tokens_per_second_per_gpu": 10164.39, "total_tokens": 1299008481 }, { "epoch": 0.8224556139034759, "grad_norm": 0.8698766231536865, "learning_rate": 2e-05, "loss": 0.6184, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13156, "tokens_per_second_per_gpu": 10457.76, "total_tokens": 1299108962 }, { "epoch": 0.8225181295323831, "grad_norm": 0.9153850674629211, "learning_rate": 2e-05, "loss": 0.6299, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13157, "tokens_per_second_per_gpu": 9998.73, "total_tokens": 1299205396 }, { "epoch": 0.8225806451612904, "grad_norm": 0.9142587780952454, "learning_rate": 2e-05, "loss": 0.5978, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13158, "tokens_per_second_per_gpu": 11012.17, "total_tokens": 1299304616 }, { "epoch": 0.8226431607901975, "grad_norm": 0.8816297650337219, "learning_rate": 2e-05, "loss": 0.5998, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13159, "tokens_per_second_per_gpu": 11009.02, "total_tokens": 1299406993 }, { "epoch": 0.8227056764191047, "grad_norm": 0.9078806638717651, "learning_rate": 2e-05, "loss": 0.6734, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13160, "tokens_per_second_per_gpu": 11167.95, "total_tokens": 1299509724 }, { "epoch": 0.822768192048012, "grad_norm": 0.9072346687316895, "learning_rate": 2e-05, "loss": 0.601, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13161, "tokens_per_second_per_gpu": 10948.67, "total_tokens": 1299609458 }, { "epoch": 0.8228307076769192, "grad_norm": 0.926697850227356, "learning_rate": 2e-05, "loss": 0.6353, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13162, "tokens_per_second_per_gpu": 10436.48, "total_tokens": 1299711892 }, { "epoch": 0.8228932233058265, "grad_norm": 0.8822050094604492, "learning_rate": 2e-05, "loss": 0.6136, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13163, "tokens_per_second_per_gpu": 11107.89, "total_tokens": 1299814395 }, { "epoch": 0.8229557389347337, "grad_norm": 0.9464782476425171, "learning_rate": 2e-05, "loss": 0.6422, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13164, "tokens_per_second_per_gpu": 10293.13, "total_tokens": 1299905257 }, { "epoch": 0.8230182545636409, "grad_norm": 0.8711404800415039, "learning_rate": 2e-05, "loss": 0.6362, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13165, "tokens_per_second_per_gpu": 11704.35, "total_tokens": 1300010262 }, { "epoch": 0.8230807701925481, "grad_norm": 0.8862766027450562, "learning_rate": 2e-05, "loss": 0.6482, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13166, "tokens_per_second_per_gpu": 11099.93, "total_tokens": 1300109550 }, { "epoch": 0.8231432858214554, "grad_norm": 0.9085150957107544, "learning_rate": 2e-05, "loss": 0.5878, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13167, "tokens_per_second_per_gpu": 10279.71, "total_tokens": 1300207919 }, { "epoch": 0.8232058014503626, "grad_norm": 0.8891993165016174, "learning_rate": 2e-05, "loss": 0.5767, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13168, "tokens_per_second_per_gpu": 9250.64, "total_tokens": 1300303005 }, { "epoch": 0.8232683170792698, "grad_norm": 0.9033852815628052, "learning_rate": 2e-05, "loss": 0.6125, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13169, "tokens_per_second_per_gpu": 10054.5, "total_tokens": 1300400809 }, { "epoch": 0.8233308327081771, "grad_norm": 0.9081894159317017, "learning_rate": 2e-05, "loss": 0.6254, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13170, "tokens_per_second_per_gpu": 10663.95, "total_tokens": 1300500086 }, { "epoch": 0.8233933483370842, "grad_norm": 0.8836304545402527, "learning_rate": 2e-05, "loss": 0.6023, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13171, "tokens_per_second_per_gpu": 10080.89, "total_tokens": 1300594828 }, { "epoch": 0.8234558639659915, "grad_norm": 0.9026042819023132, "learning_rate": 2e-05, "loss": 0.5844, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13172, "tokens_per_second_per_gpu": 10238.4, "total_tokens": 1300692750 }, { "epoch": 0.8235183795948987, "grad_norm": 0.8481881022453308, "learning_rate": 2e-05, "loss": 0.589, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13173, "tokens_per_second_per_gpu": 10662.42, "total_tokens": 1300791918 }, { "epoch": 0.823580895223806, "grad_norm": 0.8823883533477783, "learning_rate": 2e-05, "loss": 0.6029, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13174, "tokens_per_second_per_gpu": 10512.64, "total_tokens": 1300889366 }, { "epoch": 0.8236434108527132, "grad_norm": 0.8645136952400208, "learning_rate": 2e-05, "loss": 0.6715, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13175, "tokens_per_second_per_gpu": 11092.66, "total_tokens": 1300993361 }, { "epoch": 0.8237059264816204, "grad_norm": 0.900617778301239, "learning_rate": 2e-05, "loss": 0.633, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13176, "tokens_per_second_per_gpu": 10445.12, "total_tokens": 1301093077 }, { "epoch": 0.8237684421105276, "grad_norm": 0.8952859044075012, "learning_rate": 2e-05, "loss": 0.6084, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13177, "tokens_per_second_per_gpu": 10951.16, "total_tokens": 1301191819 }, { "epoch": 0.8238309577394348, "grad_norm": 0.8697735667228699, "learning_rate": 2e-05, "loss": 0.586, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13178, "tokens_per_second_per_gpu": 9768.73, "total_tokens": 1301285742 }, { "epoch": 0.8238934733683421, "grad_norm": 0.8934881687164307, "learning_rate": 2e-05, "loss": 0.6138, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13179, "tokens_per_second_per_gpu": 10466.7, "total_tokens": 1301384098 }, { "epoch": 0.8239559889972493, "grad_norm": 0.8888919949531555, "learning_rate": 2e-05, "loss": 0.6381, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13180, "tokens_per_second_per_gpu": 10470.8, "total_tokens": 1301481294 }, { "epoch": 0.8240185046261566, "grad_norm": 0.9114753603935242, "learning_rate": 2e-05, "loss": 0.7049, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13181, "tokens_per_second_per_gpu": 11178.86, "total_tokens": 1301585603 }, { "epoch": 0.8240810202550638, "grad_norm": 0.8919682502746582, "learning_rate": 2e-05, "loss": 0.6339, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13182, "tokens_per_second_per_gpu": 11070.1, "total_tokens": 1301689180 }, { "epoch": 0.824143535883971, "grad_norm": 0.8989500403404236, "learning_rate": 2e-05, "loss": 0.6648, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13183, "tokens_per_second_per_gpu": 10800.84, "total_tokens": 1301790053 }, { "epoch": 0.8242060515128782, "grad_norm": 0.8913388848304749, "learning_rate": 2e-05, "loss": 0.6276, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13184, "tokens_per_second_per_gpu": 10716.34, "total_tokens": 1301889187 }, { "epoch": 0.8242685671417854, "grad_norm": 0.8725821375846863, "learning_rate": 2e-05, "loss": 0.616, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13185, "tokens_per_second_per_gpu": 10359.07, "total_tokens": 1301988061 }, { "epoch": 0.8243310827706927, "grad_norm": 0.8843374252319336, "learning_rate": 2e-05, "loss": 0.6414, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13186, "tokens_per_second_per_gpu": 11594.18, "total_tokens": 1302090965 }, { "epoch": 0.8243935983995999, "grad_norm": 0.9247940182685852, "learning_rate": 2e-05, "loss": 0.5813, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13187, "tokens_per_second_per_gpu": 9991.81, "total_tokens": 1302183702 }, { "epoch": 0.8244561140285072, "grad_norm": 0.9122527241706848, "learning_rate": 2e-05, "loss": 0.6042, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13188, "tokens_per_second_per_gpu": 10021.61, "total_tokens": 1302282309 }, { "epoch": 0.8245186296574144, "grad_norm": 0.9128571152687073, "learning_rate": 2e-05, "loss": 0.5876, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13189, "tokens_per_second_per_gpu": 10435.41, "total_tokens": 1302377453 }, { "epoch": 0.8245811452863215, "grad_norm": 0.9024536609649658, "learning_rate": 2e-05, "loss": 0.621, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13190, "tokens_per_second_per_gpu": 10486.0, "total_tokens": 1302476764 }, { "epoch": 0.8246436609152288, "grad_norm": 0.89231938123703, "learning_rate": 2e-05, "loss": 0.5843, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13191, "tokens_per_second_per_gpu": 10626.19, "total_tokens": 1302572172 }, { "epoch": 0.824706176544136, "grad_norm": 0.8499667644500732, "learning_rate": 2e-05, "loss": 0.6094, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13192, "tokens_per_second_per_gpu": 10434.89, "total_tokens": 1302673751 }, { "epoch": 0.8247686921730433, "grad_norm": 0.9017852544784546, "learning_rate": 2e-05, "loss": 0.5906, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13193, "tokens_per_second_per_gpu": 10997.57, "total_tokens": 1302769921 }, { "epoch": 0.8248312078019505, "grad_norm": 0.9038375020027161, "learning_rate": 2e-05, "loss": 0.6177, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13194, "tokens_per_second_per_gpu": 11241.91, "total_tokens": 1302870165 }, { "epoch": 0.8248937234308578, "grad_norm": 0.8873811960220337, "learning_rate": 2e-05, "loss": 0.6402, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13195, "tokens_per_second_per_gpu": 10379.74, "total_tokens": 1302971815 }, { "epoch": 0.8249562390597649, "grad_norm": 0.8498952984809875, "learning_rate": 2e-05, "loss": 0.5895, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13196, "tokens_per_second_per_gpu": 9705.23, "total_tokens": 1303066876 }, { "epoch": 0.8250187546886721, "grad_norm": 0.8652504682540894, "learning_rate": 2e-05, "loss": 0.5901, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13197, "tokens_per_second_per_gpu": 9350.73, "total_tokens": 1303162195 }, { "epoch": 0.8250812703175794, "grad_norm": 0.8827142715454102, "learning_rate": 2e-05, "loss": 0.663, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13198, "tokens_per_second_per_gpu": 10899.72, "total_tokens": 1303263022 }, { "epoch": 0.8251437859464866, "grad_norm": 0.87953120470047, "learning_rate": 2e-05, "loss": 0.5926, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13199, "tokens_per_second_per_gpu": 10450.14, "total_tokens": 1303360501 }, { "epoch": 0.8252063015753939, "grad_norm": 0.8611106276512146, "learning_rate": 2e-05, "loss": 0.6179, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13200, "tokens_per_second_per_gpu": 10942.05, "total_tokens": 1303460227 }, { "epoch": 0.8252688172043011, "grad_norm": 0.9231119155883789, "learning_rate": 2e-05, "loss": 0.6169, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13201, "tokens_per_second_per_gpu": 10778.68, "total_tokens": 1303559268 }, { "epoch": 0.8253313328332083, "grad_norm": 0.8958086967468262, "learning_rate": 2e-05, "loss": 0.5968, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13202, "tokens_per_second_per_gpu": 10702.93, "total_tokens": 1303654877 }, { "epoch": 0.8253938484621155, "grad_norm": 0.8861986994743347, "learning_rate": 2e-05, "loss": 0.6043, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13203, "tokens_per_second_per_gpu": 10590.81, "total_tokens": 1303752235 }, { "epoch": 0.8254563640910227, "grad_norm": 0.8831300735473633, "learning_rate": 2e-05, "loss": 0.651, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13204, "tokens_per_second_per_gpu": 10613.32, "total_tokens": 1303851598 }, { "epoch": 0.82551887971993, "grad_norm": 0.8869739174842834, "learning_rate": 2e-05, "loss": 0.5829, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13205, "tokens_per_second_per_gpu": 10316.23, "total_tokens": 1303948888 }, { "epoch": 0.8255813953488372, "grad_norm": 0.8907491564750671, "learning_rate": 2e-05, "loss": 0.5593, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13206, "tokens_per_second_per_gpu": 10491.86, "total_tokens": 1304043650 }, { "epoch": 0.8256439109777445, "grad_norm": 0.854931116104126, "learning_rate": 2e-05, "loss": 0.5876, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13207, "tokens_per_second_per_gpu": 10065.21, "total_tokens": 1304141826 }, { "epoch": 0.8257064266066516, "grad_norm": 0.8968482613563538, "learning_rate": 2e-05, "loss": 0.6152, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13208, "tokens_per_second_per_gpu": 10730.0, "total_tokens": 1304237455 }, { "epoch": 0.8257689422355589, "grad_norm": 0.879870593547821, "learning_rate": 2e-05, "loss": 0.6475, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13209, "tokens_per_second_per_gpu": 10411.5, "total_tokens": 1304336948 }, { "epoch": 0.8258314578644661, "grad_norm": 0.8785210251808167, "learning_rate": 2e-05, "loss": 0.6478, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13210, "tokens_per_second_per_gpu": 11252.2, "total_tokens": 1304439781 }, { "epoch": 0.8258939734933733, "grad_norm": 0.8791254758834839, "learning_rate": 2e-05, "loss": 0.6077, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13211, "tokens_per_second_per_gpu": 9603.87, "total_tokens": 1304536811 }, { "epoch": 0.8259564891222806, "grad_norm": 0.8556339144706726, "learning_rate": 2e-05, "loss": 0.6186, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13212, "tokens_per_second_per_gpu": 11072.97, "total_tokens": 1304638214 }, { "epoch": 0.8260190047511878, "grad_norm": 0.8792740106582642, "learning_rate": 2e-05, "loss": 0.5913, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13213, "tokens_per_second_per_gpu": 10839.97, "total_tokens": 1304741230 }, { "epoch": 0.826081520380095, "grad_norm": 0.9063394069671631, "learning_rate": 2e-05, "loss": 0.6416, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13214, "tokens_per_second_per_gpu": 10160.58, "total_tokens": 1304841471 }, { "epoch": 0.8261440360090022, "grad_norm": 0.9214917421340942, "learning_rate": 2e-05, "loss": 0.6334, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13215, "tokens_per_second_per_gpu": 10267.23, "total_tokens": 1304941503 }, { "epoch": 0.8262065516379095, "grad_norm": 0.8788858652114868, "learning_rate": 2e-05, "loss": 0.6211, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13216, "tokens_per_second_per_gpu": 11127.76, "total_tokens": 1305042398 }, { "epoch": 0.8262690672668167, "grad_norm": 0.8949762582778931, "learning_rate": 2e-05, "loss": 0.6476, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13217, "tokens_per_second_per_gpu": 11274.41, "total_tokens": 1305143537 }, { "epoch": 0.826331582895724, "grad_norm": 0.9123550057411194, "learning_rate": 2e-05, "loss": 0.5853, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13218, "tokens_per_second_per_gpu": 10640.31, "total_tokens": 1305240165 }, { "epoch": 0.8263940985246312, "grad_norm": 0.9237088561058044, "learning_rate": 2e-05, "loss": 0.6294, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13219, "tokens_per_second_per_gpu": 10866.65, "total_tokens": 1305338048 }, { "epoch": 0.8264566141535384, "grad_norm": 0.9188663363456726, "learning_rate": 2e-05, "loss": 0.6012, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13220, "tokens_per_second_per_gpu": 10807.67, "total_tokens": 1305434923 }, { "epoch": 0.8265191297824456, "grad_norm": 0.8874789476394653, "learning_rate": 2e-05, "loss": 0.674, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13221, "tokens_per_second_per_gpu": 11106.49, "total_tokens": 1305539281 }, { "epoch": 0.8265816454113528, "grad_norm": 0.9033361673355103, "learning_rate": 2e-05, "loss": 0.5793, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13222, "tokens_per_second_per_gpu": 10435.93, "total_tokens": 1305638110 }, { "epoch": 0.8266441610402601, "grad_norm": 0.9109657406806946, "learning_rate": 2e-05, "loss": 0.6441, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13223, "tokens_per_second_per_gpu": 10588.67, "total_tokens": 1305735709 }, { "epoch": 0.8267066766691673, "grad_norm": 0.9011284112930298, "learning_rate": 2e-05, "loss": 0.5785, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13224, "tokens_per_second_per_gpu": 10606.83, "total_tokens": 1305831640 }, { "epoch": 0.8267691922980746, "grad_norm": 0.8647941946983337, "learning_rate": 2e-05, "loss": 0.621, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13225, "tokens_per_second_per_gpu": 11267.4, "total_tokens": 1305934313 }, { "epoch": 0.8268317079269818, "grad_norm": 0.9296772480010986, "learning_rate": 2e-05, "loss": 0.573, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13226, "tokens_per_second_per_gpu": 10654.26, "total_tokens": 1306033106 }, { "epoch": 0.8268942235558889, "grad_norm": 0.886014997959137, "learning_rate": 2e-05, "loss": 0.6212, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13227, "tokens_per_second_per_gpu": 10826.64, "total_tokens": 1306133790 }, { "epoch": 0.8269567391847962, "grad_norm": 1.021490454673767, "learning_rate": 2e-05, "loss": 0.6453, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13228, "tokens_per_second_per_gpu": 10462.66, "total_tokens": 1306234141 }, { "epoch": 0.8270192548137034, "grad_norm": 0.8472725749015808, "learning_rate": 2e-05, "loss": 0.5737, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13229, "tokens_per_second_per_gpu": 10298.95, "total_tokens": 1306333122 }, { "epoch": 0.8270817704426107, "grad_norm": 0.8873118758201599, "learning_rate": 2e-05, "loss": 0.6469, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13230, "tokens_per_second_per_gpu": 10672.03, "total_tokens": 1306435507 }, { "epoch": 0.8271442860715179, "grad_norm": 0.894171416759491, "learning_rate": 2e-05, "loss": 0.6221, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13231, "tokens_per_second_per_gpu": 10249.37, "total_tokens": 1306534616 }, { "epoch": 0.8272068017004252, "grad_norm": 0.8640432953834534, "learning_rate": 2e-05, "loss": 0.6733, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13232, "tokens_per_second_per_gpu": 10583.7, "total_tokens": 1306635276 }, { "epoch": 0.8272693173293323, "grad_norm": 0.9310866594314575, "learning_rate": 2e-05, "loss": 0.6266, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13233, "tokens_per_second_per_gpu": 10583.27, "total_tokens": 1306738354 }, { "epoch": 0.8273318329582395, "grad_norm": 0.8569561243057251, "learning_rate": 2e-05, "loss": 0.5983, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13234, "tokens_per_second_per_gpu": 10617.92, "total_tokens": 1306838370 }, { "epoch": 0.8273943485871468, "grad_norm": 0.8934925198554993, "learning_rate": 2e-05, "loss": 0.6228, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13235, "tokens_per_second_per_gpu": 10249.45, "total_tokens": 1306934289 }, { "epoch": 0.827456864216054, "grad_norm": 0.8885307312011719, "learning_rate": 2e-05, "loss": 0.6409, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13236, "tokens_per_second_per_gpu": 10318.38, "total_tokens": 1307033693 }, { "epoch": 0.8275193798449613, "grad_norm": 0.8798019289970398, "learning_rate": 2e-05, "loss": 0.6091, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13237, "tokens_per_second_per_gpu": 10384.43, "total_tokens": 1307133946 }, { "epoch": 0.8275818954738685, "grad_norm": 0.9219107627868652, "learning_rate": 2e-05, "loss": 0.6294, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13238, "tokens_per_second_per_gpu": 10693.65, "total_tokens": 1307232603 }, { "epoch": 0.8276444111027756, "grad_norm": 0.899770975112915, "learning_rate": 2e-05, "loss": 0.6076, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13239, "tokens_per_second_per_gpu": 11407.2, "total_tokens": 1307331807 }, { "epoch": 0.8277069267316829, "grad_norm": 0.8867580890655518, "learning_rate": 2e-05, "loss": 0.6245, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13240, "tokens_per_second_per_gpu": 10472.5, "total_tokens": 1307433209 }, { "epoch": 0.8277694423605901, "grad_norm": 0.8392825126647949, "learning_rate": 2e-05, "loss": 0.5907, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13241, "tokens_per_second_per_gpu": 10567.05, "total_tokens": 1307536341 }, { "epoch": 0.8278319579894974, "grad_norm": 0.8708615303039551, "learning_rate": 2e-05, "loss": 0.5973, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13242, "tokens_per_second_per_gpu": 10387.26, "total_tokens": 1307635397 }, { "epoch": 0.8278944736184046, "grad_norm": 0.8903076648712158, "learning_rate": 2e-05, "loss": 0.6564, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13243, "tokens_per_second_per_gpu": 10866.59, "total_tokens": 1307738479 }, { "epoch": 0.8279569892473119, "grad_norm": 0.8866399526596069, "learning_rate": 2e-05, "loss": 0.6007, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13244, "tokens_per_second_per_gpu": 10981.81, "total_tokens": 1307835768 }, { "epoch": 0.828019504876219, "grad_norm": 0.9091995358467102, "learning_rate": 2e-05, "loss": 0.612, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13245, "tokens_per_second_per_gpu": 10606.7, "total_tokens": 1307936481 }, { "epoch": 0.8280820205051262, "grad_norm": 0.9367080926895142, "learning_rate": 2e-05, "loss": 0.5937, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13246, "tokens_per_second_per_gpu": 9694.14, "total_tokens": 1308032220 }, { "epoch": 0.8281445361340335, "grad_norm": 0.9155134558677673, "learning_rate": 2e-05, "loss": 0.6449, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13247, "tokens_per_second_per_gpu": 10397.23, "total_tokens": 1308129979 }, { "epoch": 0.8282070517629407, "grad_norm": 0.8793189525604248, "learning_rate": 2e-05, "loss": 0.6114, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13248, "tokens_per_second_per_gpu": 9997.57, "total_tokens": 1308227146 }, { "epoch": 0.828269567391848, "grad_norm": 0.8982915878295898, "learning_rate": 2e-05, "loss": 0.6484, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13249, "tokens_per_second_per_gpu": 10329.21, "total_tokens": 1308327226 }, { "epoch": 0.8283320830207552, "grad_norm": 0.9438913464546204, "learning_rate": 2e-05, "loss": 0.5881, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13250, "tokens_per_second_per_gpu": 9477.84, "total_tokens": 1308419873 }, { "epoch": 0.8283945986496624, "grad_norm": 0.8817198276519775, "learning_rate": 2e-05, "loss": 0.5951, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13251, "tokens_per_second_per_gpu": 9933.08, "total_tokens": 1308519122 }, { "epoch": 0.8284571142785696, "grad_norm": 0.878688395023346, "learning_rate": 2e-05, "loss": 0.6247, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13252, "tokens_per_second_per_gpu": 10803.88, "total_tokens": 1308620043 }, { "epoch": 0.8285196299074769, "grad_norm": 0.8766469955444336, "learning_rate": 2e-05, "loss": 0.6108, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13253, "tokens_per_second_per_gpu": 10729.52, "total_tokens": 1308721315 }, { "epoch": 0.8285821455363841, "grad_norm": 0.9136219620704651, "learning_rate": 2e-05, "loss": 0.5981, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13254, "tokens_per_second_per_gpu": 9797.12, "total_tokens": 1308816392 }, { "epoch": 0.8286446611652913, "grad_norm": 0.8770683407783508, "learning_rate": 2e-05, "loss": 0.5899, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13255, "tokens_per_second_per_gpu": 10258.73, "total_tokens": 1308918571 }, { "epoch": 0.8287071767941986, "grad_norm": 0.873695433139801, "learning_rate": 2e-05, "loss": 0.5993, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13256, "tokens_per_second_per_gpu": 10122.8, "total_tokens": 1309015540 }, { "epoch": 0.8287696924231058, "grad_norm": 0.8844853043556213, "learning_rate": 2e-05, "loss": 0.6146, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13257, "tokens_per_second_per_gpu": 10658.13, "total_tokens": 1309113794 }, { "epoch": 0.828832208052013, "grad_norm": 0.8818608522415161, "learning_rate": 2e-05, "loss": 0.5783, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13258, "tokens_per_second_per_gpu": 9921.39, "total_tokens": 1309208599 }, { "epoch": 0.8288947236809202, "grad_norm": 0.8607455492019653, "learning_rate": 2e-05, "loss": 0.6009, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13259, "tokens_per_second_per_gpu": 10349.39, "total_tokens": 1309306183 }, { "epoch": 0.8289572393098275, "grad_norm": 0.9124440550804138, "learning_rate": 2e-05, "loss": 0.6119, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13260, "tokens_per_second_per_gpu": 10924.02, "total_tokens": 1309404162 }, { "epoch": 0.8290197549387347, "grad_norm": 0.9061046242713928, "learning_rate": 2e-05, "loss": 0.6213, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13261, "tokens_per_second_per_gpu": 10876.81, "total_tokens": 1309502166 }, { "epoch": 0.8290822705676419, "grad_norm": 0.8661479949951172, "learning_rate": 2e-05, "loss": 0.5746, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13262, "tokens_per_second_per_gpu": 10560.93, "total_tokens": 1309598288 }, { "epoch": 0.8291447861965492, "grad_norm": 0.8741138577461243, "learning_rate": 2e-05, "loss": 0.5716, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13263, "tokens_per_second_per_gpu": 11021.47, "total_tokens": 1309696780 }, { "epoch": 0.8292073018254563, "grad_norm": 0.8980898261070251, "learning_rate": 2e-05, "loss": 0.6419, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13264, "tokens_per_second_per_gpu": 10261.74, "total_tokens": 1309795140 }, { "epoch": 0.8292698174543636, "grad_norm": 0.8961716890335083, "learning_rate": 2e-05, "loss": 0.6191, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13265, "tokens_per_second_per_gpu": 9756.73, "total_tokens": 1309893398 }, { "epoch": 0.8293323330832708, "grad_norm": 0.9031710624694824, "learning_rate": 2e-05, "loss": 0.5918, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13266, "tokens_per_second_per_gpu": 10072.46, "total_tokens": 1309987178 }, { "epoch": 0.8293948487121781, "grad_norm": 0.8999066948890686, "learning_rate": 2e-05, "loss": 0.5931, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13267, "tokens_per_second_per_gpu": 10378.54, "total_tokens": 1310083780 }, { "epoch": 0.8294573643410853, "grad_norm": 0.8963892459869385, "learning_rate": 2e-05, "loss": 0.5898, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13268, "tokens_per_second_per_gpu": 10841.71, "total_tokens": 1310184974 }, { "epoch": 0.8295198799699925, "grad_norm": 0.8127390742301941, "learning_rate": 2e-05, "loss": 0.5858, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13269, "tokens_per_second_per_gpu": 11209.79, "total_tokens": 1310284612 }, { "epoch": 0.8295823955988997, "grad_norm": 0.8609074354171753, "learning_rate": 2e-05, "loss": 0.6324, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13270, "tokens_per_second_per_gpu": 11055.53, "total_tokens": 1310386626 }, { "epoch": 0.8296449112278069, "grad_norm": 0.9092517495155334, "learning_rate": 2e-05, "loss": 0.613, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13271, "tokens_per_second_per_gpu": 9871.05, "total_tokens": 1310478493 }, { "epoch": 0.8297074268567142, "grad_norm": 0.8747894763946533, "learning_rate": 2e-05, "loss": 0.5875, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13272, "tokens_per_second_per_gpu": 10180.49, "total_tokens": 1310576461 }, { "epoch": 0.8297699424856214, "grad_norm": 0.920075535774231, "learning_rate": 2e-05, "loss": 0.6002, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13273, "tokens_per_second_per_gpu": 10957.28, "total_tokens": 1310675588 }, { "epoch": 0.8298324581145287, "grad_norm": 0.9309616684913635, "learning_rate": 2e-05, "loss": 0.6289, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13274, "tokens_per_second_per_gpu": 10505.89, "total_tokens": 1310777255 }, { "epoch": 0.8298949737434359, "grad_norm": 0.8670039772987366, "learning_rate": 2e-05, "loss": 0.5841, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13275, "tokens_per_second_per_gpu": 9780.38, "total_tokens": 1310873291 }, { "epoch": 0.829957489372343, "grad_norm": 0.8686147332191467, "learning_rate": 2e-05, "loss": 0.6093, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13276, "tokens_per_second_per_gpu": 10723.63, "total_tokens": 1310976137 }, { "epoch": 0.8300200050012503, "grad_norm": 0.9164612293243408, "learning_rate": 2e-05, "loss": 0.6298, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13277, "tokens_per_second_per_gpu": 10844.91, "total_tokens": 1311076713 }, { "epoch": 0.8300825206301575, "grad_norm": 0.8732979893684387, "learning_rate": 2e-05, "loss": 0.5928, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13278, "tokens_per_second_per_gpu": 10058.28, "total_tokens": 1311173881 }, { "epoch": 0.8301450362590648, "grad_norm": 0.8887156248092651, "learning_rate": 2e-05, "loss": 0.6435, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13279, "tokens_per_second_per_gpu": 11204.14, "total_tokens": 1311275811 }, { "epoch": 0.830207551887972, "grad_norm": 0.8790819644927979, "learning_rate": 2e-05, "loss": 0.64, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13280, "tokens_per_second_per_gpu": 11281.38, "total_tokens": 1311377618 }, { "epoch": 0.8302700675168793, "grad_norm": 0.9204948544502258, "learning_rate": 2e-05, "loss": 0.6521, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13281, "tokens_per_second_per_gpu": 10307.51, "total_tokens": 1311474233 }, { "epoch": 0.8303325831457864, "grad_norm": 0.8708820939064026, "learning_rate": 2e-05, "loss": 0.6063, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13282, "tokens_per_second_per_gpu": 11070.09, "total_tokens": 1311576558 }, { "epoch": 0.8303950987746936, "grad_norm": 0.86665278673172, "learning_rate": 2e-05, "loss": 0.5843, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13283, "tokens_per_second_per_gpu": 10230.95, "total_tokens": 1311670273 }, { "epoch": 0.8304576144036009, "grad_norm": 0.9125386476516724, "learning_rate": 2e-05, "loss": 0.6171, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13284, "tokens_per_second_per_gpu": 10400.54, "total_tokens": 1311769602 }, { "epoch": 0.8305201300325081, "grad_norm": 0.8890613317489624, "learning_rate": 2e-05, "loss": 0.6093, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13285, "tokens_per_second_per_gpu": 10333.37, "total_tokens": 1311869366 }, { "epoch": 0.8305826456614154, "grad_norm": 0.882940948009491, "learning_rate": 2e-05, "loss": 0.6251, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13286, "tokens_per_second_per_gpu": 10760.51, "total_tokens": 1311965812 }, { "epoch": 0.8306451612903226, "grad_norm": 0.9221075177192688, "learning_rate": 2e-05, "loss": 0.6086, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13287, "tokens_per_second_per_gpu": 10055.36, "total_tokens": 1312061720 }, { "epoch": 0.8307076769192298, "grad_norm": 0.9152158498764038, "learning_rate": 2e-05, "loss": 0.621, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13288, "tokens_per_second_per_gpu": 10861.03, "total_tokens": 1312163749 }, { "epoch": 0.830770192548137, "grad_norm": 0.8832345008850098, "learning_rate": 2e-05, "loss": 0.6348, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13289, "tokens_per_second_per_gpu": 10502.1, "total_tokens": 1312262005 }, { "epoch": 0.8308327081770442, "grad_norm": 0.9238438606262207, "learning_rate": 2e-05, "loss": 0.6108, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13290, "tokens_per_second_per_gpu": 10263.15, "total_tokens": 1312362757 }, { "epoch": 0.8308952238059515, "grad_norm": 0.9029386639595032, "learning_rate": 2e-05, "loss": 0.6165, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13291, "tokens_per_second_per_gpu": 10789.2, "total_tokens": 1312462657 }, { "epoch": 0.8309577394348587, "grad_norm": 0.9010618925094604, "learning_rate": 2e-05, "loss": 0.6106, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13292, "tokens_per_second_per_gpu": 9762.54, "total_tokens": 1312559443 }, { "epoch": 0.831020255063766, "grad_norm": 0.9289340376853943, "learning_rate": 2e-05, "loss": 0.6262, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13293, "tokens_per_second_per_gpu": 9996.71, "total_tokens": 1312654142 }, { "epoch": 0.8310827706926732, "grad_norm": 0.8834078311920166, "learning_rate": 2e-05, "loss": 0.5559, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13294, "tokens_per_second_per_gpu": 9980.19, "total_tokens": 1312748555 }, { "epoch": 0.8311452863215804, "grad_norm": 0.9083647131919861, "learning_rate": 2e-05, "loss": 0.6471, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13295, "tokens_per_second_per_gpu": 11178.53, "total_tokens": 1312852773 }, { "epoch": 0.8312078019504876, "grad_norm": 0.923284113407135, "learning_rate": 2e-05, "loss": 0.6341, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13296, "tokens_per_second_per_gpu": 9431.9, "total_tokens": 1312947197 }, { "epoch": 0.8312703175793948, "grad_norm": 0.8896458745002747, "learning_rate": 2e-05, "loss": 0.5965, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13297, "tokens_per_second_per_gpu": 10187.24, "total_tokens": 1313042046 }, { "epoch": 0.8313328332083021, "grad_norm": 0.8903273940086365, "learning_rate": 2e-05, "loss": 0.6215, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13298, "tokens_per_second_per_gpu": 10248.43, "total_tokens": 1313140231 }, { "epoch": 0.8313953488372093, "grad_norm": 0.9161527156829834, "learning_rate": 2e-05, "loss": 0.6339, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13299, "tokens_per_second_per_gpu": 10598.38, "total_tokens": 1313238412 }, { "epoch": 0.8314578644661166, "grad_norm": 0.8670204877853394, "learning_rate": 2e-05, "loss": 0.6463, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13300, "tokens_per_second_per_gpu": 10543.96, "total_tokens": 1313341015 }, { "epoch": 0.8315203800950237, "grad_norm": 0.8908977508544922, "learning_rate": 2e-05, "loss": 0.6007, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13301, "tokens_per_second_per_gpu": 10709.88, "total_tokens": 1313437320 }, { "epoch": 0.831582895723931, "grad_norm": 0.9112862944602966, "learning_rate": 2e-05, "loss": 0.6309, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13302, "tokens_per_second_per_gpu": 10205.92, "total_tokens": 1313534354 }, { "epoch": 0.8316454113528382, "grad_norm": 0.8803541660308838, "learning_rate": 2e-05, "loss": 0.6247, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13303, "tokens_per_second_per_gpu": 10622.69, "total_tokens": 1313633081 }, { "epoch": 0.8317079269817454, "grad_norm": 0.8787957429885864, "learning_rate": 2e-05, "loss": 0.6607, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13304, "tokens_per_second_per_gpu": 10696.68, "total_tokens": 1313735465 }, { "epoch": 0.8317704426106527, "grad_norm": 0.8536677360534668, "learning_rate": 2e-05, "loss": 0.6249, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13305, "tokens_per_second_per_gpu": 10947.44, "total_tokens": 1313838222 }, { "epoch": 0.8318329582395599, "grad_norm": 0.9055492877960205, "learning_rate": 2e-05, "loss": 0.6041, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13306, "tokens_per_second_per_gpu": 10254.8, "total_tokens": 1313935292 }, { "epoch": 0.8318954738684671, "grad_norm": 0.9952182769775391, "learning_rate": 2e-05, "loss": 0.6026, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13307, "tokens_per_second_per_gpu": 10703.82, "total_tokens": 1314035316 }, { "epoch": 0.8319579894973743, "grad_norm": 0.8810675740242004, "learning_rate": 2e-05, "loss": 0.6288, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13308, "tokens_per_second_per_gpu": 10618.02, "total_tokens": 1314135483 }, { "epoch": 0.8320205051262816, "grad_norm": 0.8823394775390625, "learning_rate": 2e-05, "loss": 0.6187, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13309, "tokens_per_second_per_gpu": 10476.04, "total_tokens": 1314232714 }, { "epoch": 0.8320830207551888, "grad_norm": 0.8721358180046082, "learning_rate": 2e-05, "loss": 0.6293, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13310, "tokens_per_second_per_gpu": 10970.51, "total_tokens": 1314337192 }, { "epoch": 0.832145536384096, "grad_norm": 0.9057504534721375, "learning_rate": 2e-05, "loss": 0.6287, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13311, "tokens_per_second_per_gpu": 10362.33, "total_tokens": 1314434260 }, { "epoch": 0.8322080520130033, "grad_norm": 0.919333815574646, "learning_rate": 2e-05, "loss": 0.6734, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13312, "tokens_per_second_per_gpu": 11032.12, "total_tokens": 1314535837 }, { "epoch": 0.8322705676419104, "grad_norm": 0.8793916702270508, "learning_rate": 2e-05, "loss": 0.6084, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13313, "tokens_per_second_per_gpu": 11093.41, "total_tokens": 1314637403 }, { "epoch": 0.8323330832708177, "grad_norm": 0.9076793789863586, "learning_rate": 2e-05, "loss": 0.602, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13314, "tokens_per_second_per_gpu": 10477.68, "total_tokens": 1314734763 }, { "epoch": 0.8323955988997249, "grad_norm": 0.835477352142334, "learning_rate": 2e-05, "loss": 0.5929, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13315, "tokens_per_second_per_gpu": 10616.26, "total_tokens": 1314835119 }, { "epoch": 0.8324581145286322, "grad_norm": 0.9057714343070984, "learning_rate": 2e-05, "loss": 0.6335, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13316, "tokens_per_second_per_gpu": 10751.74, "total_tokens": 1314933886 }, { "epoch": 0.8325206301575394, "grad_norm": 0.9126867055892944, "learning_rate": 2e-05, "loss": 0.6268, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13317, "tokens_per_second_per_gpu": 10297.25, "total_tokens": 1315030761 }, { "epoch": 0.8325831457864467, "grad_norm": 0.9049018025398254, "learning_rate": 2e-05, "loss": 0.6507, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13318, "tokens_per_second_per_gpu": 10017.86, "total_tokens": 1315127986 }, { "epoch": 0.8326456614153538, "grad_norm": 0.8766126036643982, "learning_rate": 2e-05, "loss": 0.6064, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13319, "tokens_per_second_per_gpu": 11285.42, "total_tokens": 1315225499 }, { "epoch": 0.832708177044261, "grad_norm": 0.8752185106277466, "learning_rate": 2e-05, "loss": 0.6041, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13320, "tokens_per_second_per_gpu": 9911.03, "total_tokens": 1315327229 }, { "epoch": 0.8327706926731683, "grad_norm": 0.9434369206428528, "learning_rate": 2e-05, "loss": 0.5986, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13321, "tokens_per_second_per_gpu": 13006.39, "total_tokens": 1315426072 }, { "epoch": 0.8328332083020755, "grad_norm": 0.8992043733596802, "learning_rate": 2e-05, "loss": 0.6468, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13322, "tokens_per_second_per_gpu": 11912.42, "total_tokens": 1315527592 }, { "epoch": 0.8328957239309828, "grad_norm": 0.8980743288993835, "learning_rate": 2e-05, "loss": 0.6101, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13323, "tokens_per_second_per_gpu": 9287.6, "total_tokens": 1315625797 }, { "epoch": 0.83295823955989, "grad_norm": 0.8896728754043579, "learning_rate": 2e-05, "loss": 0.6427, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13324, "tokens_per_second_per_gpu": 10374.87, "total_tokens": 1315726718 }, { "epoch": 0.8330207551887971, "grad_norm": 0.877592146396637, "learning_rate": 2e-05, "loss": 0.5959, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13325, "tokens_per_second_per_gpu": 11588.02, "total_tokens": 1315827298 }, { "epoch": 0.8330832708177044, "grad_norm": 0.9145923256874084, "learning_rate": 2e-05, "loss": 0.6273, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13326, "tokens_per_second_per_gpu": 10366.89, "total_tokens": 1315921922 }, { "epoch": 0.8331457864466116, "grad_norm": 0.8632796406745911, "learning_rate": 2e-05, "loss": 0.6231, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13327, "tokens_per_second_per_gpu": 11009.76, "total_tokens": 1316022671 }, { "epoch": 0.8332083020755189, "grad_norm": 0.8895304203033447, "learning_rate": 2e-05, "loss": 0.5776, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13328, "tokens_per_second_per_gpu": 9887.57, "total_tokens": 1316114369 }, { "epoch": 0.8332708177044261, "grad_norm": 0.9236732721328735, "learning_rate": 2e-05, "loss": 0.6272, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13329, "tokens_per_second_per_gpu": 9829.37, "total_tokens": 1316209157 }, { "epoch": 0.8333333333333334, "grad_norm": 0.911371111869812, "learning_rate": 2e-05, "loss": 0.6094, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13330, "tokens_per_second_per_gpu": 10523.68, "total_tokens": 1316303852 }, { "epoch": 0.8333958489622406, "grad_norm": 0.8950140476226807, "learning_rate": 2e-05, "loss": 0.6055, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13331, "tokens_per_second_per_gpu": 10803.95, "total_tokens": 1316404904 }, { "epoch": 0.8334583645911477, "grad_norm": 0.9125170707702637, "learning_rate": 2e-05, "loss": 0.6366, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13332, "tokens_per_second_per_gpu": 10182.12, "total_tokens": 1316499890 }, { "epoch": 0.833520880220055, "grad_norm": 0.880891740322113, "learning_rate": 2e-05, "loss": 0.6661, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13333, "tokens_per_second_per_gpu": 10466.25, "total_tokens": 1316601567 }, { "epoch": 0.8335833958489622, "grad_norm": 0.8855836391448975, "learning_rate": 2e-05, "loss": 0.6129, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13334, "tokens_per_second_per_gpu": 11179.75, "total_tokens": 1316704917 }, { "epoch": 0.8336459114778695, "grad_norm": 0.8944646120071411, "learning_rate": 2e-05, "loss": 0.6269, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13335, "tokens_per_second_per_gpu": 10602.79, "total_tokens": 1316803051 }, { "epoch": 0.8337084271067767, "grad_norm": 0.9038159251213074, "learning_rate": 2e-05, "loss": 0.6837, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13336, "tokens_per_second_per_gpu": 10948.5, "total_tokens": 1316906583 }, { "epoch": 0.833770942735684, "grad_norm": 0.9181434512138367, "learning_rate": 2e-05, "loss": 0.6477, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13337, "tokens_per_second_per_gpu": 10194.37, "total_tokens": 1317004403 }, { "epoch": 0.8338334583645911, "grad_norm": 0.880037248134613, "learning_rate": 2e-05, "loss": 0.5525, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13338, "tokens_per_second_per_gpu": 9817.57, "total_tokens": 1317096582 }, { "epoch": 0.8338959739934984, "grad_norm": 0.885867178440094, "learning_rate": 2e-05, "loss": 0.616, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13339, "tokens_per_second_per_gpu": 10761.83, "total_tokens": 1317200165 }, { "epoch": 0.8339584896224056, "grad_norm": 0.9067394137382507, "learning_rate": 2e-05, "loss": 0.6194, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13340, "tokens_per_second_per_gpu": 9814.38, "total_tokens": 1317297285 }, { "epoch": 0.8340210052513128, "grad_norm": 0.9275997877120972, "learning_rate": 2e-05, "loss": 0.617, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13341, "tokens_per_second_per_gpu": 10265.37, "total_tokens": 1317395654 }, { "epoch": 0.8340835208802201, "grad_norm": 0.9321606755256653, "learning_rate": 2e-05, "loss": 0.6207, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13342, "tokens_per_second_per_gpu": 9686.94, "total_tokens": 1317491223 }, { "epoch": 0.8341460365091273, "grad_norm": 0.8992909789085388, "learning_rate": 2e-05, "loss": 0.6509, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13343, "tokens_per_second_per_gpu": 10007.4, "total_tokens": 1317591808 }, { "epoch": 0.8342085521380345, "grad_norm": 0.897794783115387, "learning_rate": 2e-05, "loss": 0.6333, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13344, "tokens_per_second_per_gpu": 10013.11, "total_tokens": 1317689828 }, { "epoch": 0.8342710677669417, "grad_norm": 0.8975778818130493, "learning_rate": 2e-05, "loss": 0.6204, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13345, "tokens_per_second_per_gpu": 10181.88, "total_tokens": 1317786038 }, { "epoch": 0.834333583395849, "grad_norm": 0.8959968686103821, "learning_rate": 2e-05, "loss": 0.6001, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13346, "tokens_per_second_per_gpu": 9923.92, "total_tokens": 1317881744 }, { "epoch": 0.8343960990247562, "grad_norm": 0.9070702195167542, "learning_rate": 2e-05, "loss": 0.6144, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13347, "tokens_per_second_per_gpu": 10296.46, "total_tokens": 1317977975 }, { "epoch": 0.8344586146536634, "grad_norm": 0.9248968958854675, "learning_rate": 2e-05, "loss": 0.5954, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13348, "tokens_per_second_per_gpu": 9831.18, "total_tokens": 1318074291 }, { "epoch": 0.8345211302825707, "grad_norm": 0.8719033598899841, "learning_rate": 2e-05, "loss": 0.6312, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13349, "tokens_per_second_per_gpu": 11500.34, "total_tokens": 1318175653 }, { "epoch": 0.8345836459114778, "grad_norm": 0.844760537147522, "learning_rate": 2e-05, "loss": 0.6314, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13350, "tokens_per_second_per_gpu": 10922.56, "total_tokens": 1318277317 }, { "epoch": 0.8346461615403851, "grad_norm": 0.9246121644973755, "learning_rate": 2e-05, "loss": 0.6708, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13351, "tokens_per_second_per_gpu": 11698.55, "total_tokens": 1318378513 }, { "epoch": 0.8347086771692923, "grad_norm": 0.8982869386672974, "learning_rate": 2e-05, "loss": 0.6763, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13352, "tokens_per_second_per_gpu": 10101.55, "total_tokens": 1318477014 }, { "epoch": 0.8347711927981996, "grad_norm": 0.8811835646629333, "learning_rate": 2e-05, "loss": 0.6074, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13353, "tokens_per_second_per_gpu": 10413.86, "total_tokens": 1318575274 }, { "epoch": 0.8348337084271068, "grad_norm": 0.8974056243896484, "learning_rate": 2e-05, "loss": 0.5944, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13354, "tokens_per_second_per_gpu": 10240.67, "total_tokens": 1318670710 }, { "epoch": 0.834896224056014, "grad_norm": 0.8861449956893921, "learning_rate": 2e-05, "loss": 0.6177, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13355, "tokens_per_second_per_gpu": 10933.74, "total_tokens": 1318770803 }, { "epoch": 0.8349587396849212, "grad_norm": 0.8863227367401123, "learning_rate": 2e-05, "loss": 0.6191, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13356, "tokens_per_second_per_gpu": 10290.82, "total_tokens": 1318865733 }, { "epoch": 0.8350212553138284, "grad_norm": 0.8869077563285828, "learning_rate": 2e-05, "loss": 0.6268, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13357, "tokens_per_second_per_gpu": 10706.17, "total_tokens": 1318964099 }, { "epoch": 0.8350837709427357, "grad_norm": 0.8717397451400757, "learning_rate": 2e-05, "loss": 0.5905, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13358, "tokens_per_second_per_gpu": 10535.11, "total_tokens": 1319061405 }, { "epoch": 0.8351462865716429, "grad_norm": 0.8650019764900208, "learning_rate": 2e-05, "loss": 0.6134, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13359, "tokens_per_second_per_gpu": 10357.62, "total_tokens": 1319159154 }, { "epoch": 0.8352088022005502, "grad_norm": 0.9030314087867737, "learning_rate": 2e-05, "loss": 0.6169, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13360, "tokens_per_second_per_gpu": 9978.74, "total_tokens": 1319257000 }, { "epoch": 0.8352713178294574, "grad_norm": 0.8762553930282593, "learning_rate": 2e-05, "loss": 0.5841, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13361, "tokens_per_second_per_gpu": 10243.75, "total_tokens": 1319349280 }, { "epoch": 0.8353338334583645, "grad_norm": 0.8706739544868469, "learning_rate": 2e-05, "loss": 0.6107, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13362, "tokens_per_second_per_gpu": 10249.6, "total_tokens": 1319448173 }, { "epoch": 0.8353963490872718, "grad_norm": 0.9263964891433716, "learning_rate": 2e-05, "loss": 0.641, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13363, "tokens_per_second_per_gpu": 10465.98, "total_tokens": 1319542955 }, { "epoch": 0.835458864716179, "grad_norm": 0.8837507963180542, "learning_rate": 2e-05, "loss": 0.5944, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13364, "tokens_per_second_per_gpu": 10854.74, "total_tokens": 1319641582 }, { "epoch": 0.8355213803450863, "grad_norm": 0.8994787931442261, "learning_rate": 2e-05, "loss": 0.652, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13365, "tokens_per_second_per_gpu": 10398.64, "total_tokens": 1319741553 }, { "epoch": 0.8355838959739935, "grad_norm": 0.894803524017334, "learning_rate": 2e-05, "loss": 0.6074, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13366, "tokens_per_second_per_gpu": 10249.58, "total_tokens": 1319837882 }, { "epoch": 0.8356464116029008, "grad_norm": 0.9271476864814758, "learning_rate": 2e-05, "loss": 0.6332, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13367, "tokens_per_second_per_gpu": 9751.24, "total_tokens": 1319931047 }, { "epoch": 0.8357089272318079, "grad_norm": 0.9326974749565125, "learning_rate": 2e-05, "loss": 0.6335, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13368, "tokens_per_second_per_gpu": 10449.7, "total_tokens": 1320032055 }, { "epoch": 0.8357714428607151, "grad_norm": 0.8984860181808472, "learning_rate": 2e-05, "loss": 0.5883, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13369, "tokens_per_second_per_gpu": 10536.53, "total_tokens": 1320126473 }, { "epoch": 0.8358339584896224, "grad_norm": 0.9282241463661194, "learning_rate": 2e-05, "loss": 0.6344, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13370, "tokens_per_second_per_gpu": 10468.3, "total_tokens": 1320220668 }, { "epoch": 0.8358964741185296, "grad_norm": 0.8835962414741516, "learning_rate": 2e-05, "loss": 0.6189, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13371, "tokens_per_second_per_gpu": 10267.72, "total_tokens": 1320319175 }, { "epoch": 0.8359589897474369, "grad_norm": 0.8614303469657898, "learning_rate": 2e-05, "loss": 0.6338, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13372, "tokens_per_second_per_gpu": 10620.76, "total_tokens": 1320423401 }, { "epoch": 0.8360215053763441, "grad_norm": 0.8783655762672424, "learning_rate": 2e-05, "loss": 0.6153, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13373, "tokens_per_second_per_gpu": 10324.48, "total_tokens": 1320518343 }, { "epoch": 0.8360840210052514, "grad_norm": 0.8802858591079712, "learning_rate": 2e-05, "loss": 0.6409, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13374, "tokens_per_second_per_gpu": 10903.49, "total_tokens": 1320620251 }, { "epoch": 0.8361465366341585, "grad_norm": 0.9850273728370667, "learning_rate": 2e-05, "loss": 0.6259, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13375, "tokens_per_second_per_gpu": 10007.26, "total_tokens": 1320715282 }, { "epoch": 0.8362090522630657, "grad_norm": 0.8905729055404663, "learning_rate": 2e-05, "loss": 0.6033, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13376, "tokens_per_second_per_gpu": 9991.08, "total_tokens": 1320809041 }, { "epoch": 0.836271567891973, "grad_norm": 0.8469846844673157, "learning_rate": 2e-05, "loss": 0.5926, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13377, "tokens_per_second_per_gpu": 10779.63, "total_tokens": 1320907638 }, { "epoch": 0.8363340835208802, "grad_norm": 0.8791913986206055, "learning_rate": 2e-05, "loss": 0.6236, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13378, "tokens_per_second_per_gpu": 10533.77, "total_tokens": 1321006744 }, { "epoch": 0.8363965991497875, "grad_norm": 0.8750864863395691, "learning_rate": 2e-05, "loss": 0.6382, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13379, "tokens_per_second_per_gpu": 10602.59, "total_tokens": 1321105605 }, { "epoch": 0.8364591147786947, "grad_norm": 0.9136403203010559, "learning_rate": 2e-05, "loss": 0.6252, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13380, "tokens_per_second_per_gpu": 10016.56, "total_tokens": 1321202368 }, { "epoch": 0.8365216304076019, "grad_norm": 0.8839870691299438, "learning_rate": 2e-05, "loss": 0.6173, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13381, "tokens_per_second_per_gpu": 9888.04, "total_tokens": 1321302043 }, { "epoch": 0.8365841460365091, "grad_norm": 0.8915051817893982, "learning_rate": 2e-05, "loss": 0.5972, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13382, "tokens_per_second_per_gpu": 9685.21, "total_tokens": 1321395600 }, { "epoch": 0.8366466616654163, "grad_norm": 0.9291285872459412, "learning_rate": 2e-05, "loss": 0.6199, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13383, "tokens_per_second_per_gpu": 11038.2, "total_tokens": 1321490509 }, { "epoch": 0.8367091772943236, "grad_norm": 0.9224148988723755, "learning_rate": 2e-05, "loss": 0.6189, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13384, "tokens_per_second_per_gpu": 10152.62, "total_tokens": 1321581642 }, { "epoch": 0.8367716929232308, "grad_norm": 0.8921064138412476, "learning_rate": 2e-05, "loss": 0.6669, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13385, "tokens_per_second_per_gpu": 10923.75, "total_tokens": 1321683065 }, { "epoch": 0.8368342085521381, "grad_norm": 0.854211688041687, "learning_rate": 2e-05, "loss": 0.5827, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13386, "tokens_per_second_per_gpu": 10232.0, "total_tokens": 1321779520 }, { "epoch": 0.8368967241810452, "grad_norm": 0.9113577008247375, "learning_rate": 2e-05, "loss": 0.6054, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13387, "tokens_per_second_per_gpu": 10014.49, "total_tokens": 1321878857 }, { "epoch": 0.8369592398099525, "grad_norm": 0.9025017023086548, "learning_rate": 2e-05, "loss": 0.5851, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13388, "tokens_per_second_per_gpu": 10788.42, "total_tokens": 1321973190 }, { "epoch": 0.8370217554388597, "grad_norm": 0.9108360409736633, "learning_rate": 2e-05, "loss": 0.6382, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13389, "tokens_per_second_per_gpu": 10360.35, "total_tokens": 1322071176 }, { "epoch": 0.837084271067767, "grad_norm": 0.8946496844291687, "learning_rate": 2e-05, "loss": 0.6319, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13390, "tokens_per_second_per_gpu": 10468.3, "total_tokens": 1322168408 }, { "epoch": 0.8371467866966742, "grad_norm": 0.903156578540802, "learning_rate": 2e-05, "loss": 0.5945, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13391, "tokens_per_second_per_gpu": 9959.28, "total_tokens": 1322260107 }, { "epoch": 0.8372093023255814, "grad_norm": 0.895336925983429, "learning_rate": 2e-05, "loss": 0.6025, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13392, "tokens_per_second_per_gpu": 10728.87, "total_tokens": 1322358321 }, { "epoch": 0.8372718179544886, "grad_norm": 0.882594645023346, "learning_rate": 2e-05, "loss": 0.6112, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13393, "tokens_per_second_per_gpu": 10443.33, "total_tokens": 1322453537 }, { "epoch": 0.8373343335833958, "grad_norm": 0.90716552734375, "learning_rate": 2e-05, "loss": 0.6363, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13394, "tokens_per_second_per_gpu": 10551.41, "total_tokens": 1322552306 }, { "epoch": 0.8373968492123031, "grad_norm": 0.8642174601554871, "learning_rate": 2e-05, "loss": 0.598, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13395, "tokens_per_second_per_gpu": 10073.59, "total_tokens": 1322650068 }, { "epoch": 0.8374593648412103, "grad_norm": 0.870534360408783, "learning_rate": 2e-05, "loss": 0.6258, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13396, "tokens_per_second_per_gpu": 11342.97, "total_tokens": 1322752950 }, { "epoch": 0.8375218804701176, "grad_norm": 0.9107248783111572, "learning_rate": 2e-05, "loss": 0.6261, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13397, "tokens_per_second_per_gpu": 10294.28, "total_tokens": 1322851602 }, { "epoch": 0.8375843960990248, "grad_norm": 0.8935804963111877, "learning_rate": 2e-05, "loss": 0.6154, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13398, "tokens_per_second_per_gpu": 10240.06, "total_tokens": 1322947936 }, { "epoch": 0.8376469117279319, "grad_norm": 0.9342385530471802, "learning_rate": 2e-05, "loss": 0.5946, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13399, "tokens_per_second_per_gpu": 10160.36, "total_tokens": 1323042337 }, { "epoch": 0.8377094273568392, "grad_norm": 0.898124635219574, "learning_rate": 2e-05, "loss": 0.6069, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13400, "tokens_per_second_per_gpu": 10166.78, "total_tokens": 1323133349 }, { "epoch": 0.8377719429857464, "grad_norm": 0.8917257785797119, "learning_rate": 2e-05, "loss": 0.612, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13401, "tokens_per_second_per_gpu": 10288.04, "total_tokens": 1323233044 }, { "epoch": 0.8378344586146537, "grad_norm": 0.8860315680503845, "learning_rate": 2e-05, "loss": 0.6759, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13402, "tokens_per_second_per_gpu": 10706.37, "total_tokens": 1323334413 }, { "epoch": 0.8378969742435609, "grad_norm": 0.9978578686714172, "learning_rate": 2e-05, "loss": 0.6328, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13403, "tokens_per_second_per_gpu": 10354.56, "total_tokens": 1323429013 }, { "epoch": 0.8379594898724682, "grad_norm": 0.9009940028190613, "learning_rate": 2e-05, "loss": 0.5967, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13404, "tokens_per_second_per_gpu": 10086.71, "total_tokens": 1323522209 }, { "epoch": 0.8380220055013753, "grad_norm": 0.9081056118011475, "learning_rate": 2e-05, "loss": 0.6221, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13405, "tokens_per_second_per_gpu": 10215.33, "total_tokens": 1323615932 }, { "epoch": 0.8380845211302825, "grad_norm": 0.8786410689353943, "learning_rate": 2e-05, "loss": 0.6025, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13406, "tokens_per_second_per_gpu": 11137.35, "total_tokens": 1323713756 }, { "epoch": 0.8381470367591898, "grad_norm": 0.8703792095184326, "learning_rate": 2e-05, "loss": 0.5524, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13407, "tokens_per_second_per_gpu": 9736.13, "total_tokens": 1323806824 }, { "epoch": 0.838209552388097, "grad_norm": 0.9174602627754211, "learning_rate": 2e-05, "loss": 0.5945, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13408, "tokens_per_second_per_gpu": 10145.91, "total_tokens": 1323903127 }, { "epoch": 0.8382720680170043, "grad_norm": 0.9430630803108215, "learning_rate": 2e-05, "loss": 0.6567, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13409, "tokens_per_second_per_gpu": 9978.71, "total_tokens": 1324000655 }, { "epoch": 0.8383345836459115, "grad_norm": 0.8810876607894897, "learning_rate": 2e-05, "loss": 0.6027, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13410, "tokens_per_second_per_gpu": 10253.99, "total_tokens": 1324100285 }, { "epoch": 0.8383970992748188, "grad_norm": 0.8844596743583679, "learning_rate": 2e-05, "loss": 0.6059, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13411, "tokens_per_second_per_gpu": 10278.08, "total_tokens": 1324196231 }, { "epoch": 0.8384596149037259, "grad_norm": 0.8877901434898376, "learning_rate": 2e-05, "loss": 0.5834, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13412, "tokens_per_second_per_gpu": 10294.37, "total_tokens": 1324290191 }, { "epoch": 0.8385221305326331, "grad_norm": 0.9187301397323608, "learning_rate": 2e-05, "loss": 0.6248, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13413, "tokens_per_second_per_gpu": 10826.23, "total_tokens": 1324389063 }, { "epoch": 0.8385846461615404, "grad_norm": 0.9190017580986023, "learning_rate": 2e-05, "loss": 0.5816, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13414, "tokens_per_second_per_gpu": 10394.77, "total_tokens": 1324484810 }, { "epoch": 0.8386471617904476, "grad_norm": 0.903743326663971, "learning_rate": 2e-05, "loss": 0.616, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13415, "tokens_per_second_per_gpu": 10051.6, "total_tokens": 1324580437 }, { "epoch": 0.8387096774193549, "grad_norm": 0.8996389508247375, "learning_rate": 2e-05, "loss": 0.6195, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13416, "tokens_per_second_per_gpu": 10413.96, "total_tokens": 1324679531 }, { "epoch": 0.8387721930482621, "grad_norm": 0.9011680483818054, "learning_rate": 2e-05, "loss": 0.6005, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13417, "tokens_per_second_per_gpu": 11029.2, "total_tokens": 1324783145 }, { "epoch": 0.8388347086771692, "grad_norm": 0.8955127596855164, "learning_rate": 2e-05, "loss": 0.6159, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13418, "tokens_per_second_per_gpu": 10119.51, "total_tokens": 1324879537 }, { "epoch": 0.8388972243060765, "grad_norm": 0.9228368997573853, "learning_rate": 2e-05, "loss": 0.6083, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13419, "tokens_per_second_per_gpu": 10858.14, "total_tokens": 1324975448 }, { "epoch": 0.8389597399349837, "grad_norm": 0.8937057852745056, "learning_rate": 2e-05, "loss": 0.5925, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13420, "tokens_per_second_per_gpu": 9732.44, "total_tokens": 1325070891 }, { "epoch": 0.839022255563891, "grad_norm": 0.9193075299263, "learning_rate": 2e-05, "loss": 0.6214, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13421, "tokens_per_second_per_gpu": 10422.09, "total_tokens": 1325163670 }, { "epoch": 0.8390847711927982, "grad_norm": 0.865226686000824, "learning_rate": 2e-05, "loss": 0.5839, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13422, "tokens_per_second_per_gpu": 9874.9, "total_tokens": 1325261782 }, { "epoch": 0.8391472868217055, "grad_norm": 0.9295667409896851, "learning_rate": 2e-05, "loss": 0.5956, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13423, "tokens_per_second_per_gpu": 9331.65, "total_tokens": 1325354989 }, { "epoch": 0.8392098024506126, "grad_norm": 0.9024310111999512, "learning_rate": 2e-05, "loss": 0.6414, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13424, "tokens_per_second_per_gpu": 10979.05, "total_tokens": 1325455199 }, { "epoch": 0.8392723180795199, "grad_norm": 0.8913702368736267, "learning_rate": 2e-05, "loss": 0.6111, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13425, "tokens_per_second_per_gpu": 11021.16, "total_tokens": 1325553314 }, { "epoch": 0.8393348337084271, "grad_norm": 0.9220178723335266, "learning_rate": 2e-05, "loss": 0.5723, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13426, "tokens_per_second_per_gpu": 9826.81, "total_tokens": 1325641378 }, { "epoch": 0.8393973493373343, "grad_norm": 0.946892261505127, "learning_rate": 2e-05, "loss": 0.701, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13427, "tokens_per_second_per_gpu": 10779.2, "total_tokens": 1325740382 }, { "epoch": 0.8394598649662416, "grad_norm": 0.9512506127357483, "learning_rate": 2e-05, "loss": 0.6308, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13428, "tokens_per_second_per_gpu": 9828.74, "total_tokens": 1325834070 }, { "epoch": 0.8395223805951488, "grad_norm": 0.8919453024864197, "learning_rate": 2e-05, "loss": 0.6183, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13429, "tokens_per_second_per_gpu": 10640.35, "total_tokens": 1325930945 }, { "epoch": 0.839584896224056, "grad_norm": 0.8715965747833252, "learning_rate": 2e-05, "loss": 0.6128, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13430, "tokens_per_second_per_gpu": 9828.02, "total_tokens": 1326028251 }, { "epoch": 0.8396474118529632, "grad_norm": 0.9228979349136353, "learning_rate": 2e-05, "loss": 0.5991, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13431, "tokens_per_second_per_gpu": 9652.3, "total_tokens": 1326123806 }, { "epoch": 0.8397099274818705, "grad_norm": 0.8748829364776611, "learning_rate": 2e-05, "loss": 0.5943, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13432, "tokens_per_second_per_gpu": 10317.57, "total_tokens": 1326219921 }, { "epoch": 0.8397724431107777, "grad_norm": 0.8974685072898865, "learning_rate": 2e-05, "loss": 0.6129, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13433, "tokens_per_second_per_gpu": 10087.46, "total_tokens": 1326319545 }, { "epoch": 0.839834958739685, "grad_norm": 0.9097645878791809, "learning_rate": 2e-05, "loss": 0.628, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13434, "tokens_per_second_per_gpu": 9920.92, "total_tokens": 1326415947 }, { "epoch": 0.8398974743685922, "grad_norm": 0.8887441158294678, "learning_rate": 2e-05, "loss": 0.5732, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13435, "tokens_per_second_per_gpu": 10109.45, "total_tokens": 1326508440 }, { "epoch": 0.8399599899974993, "grad_norm": 0.9053502082824707, "learning_rate": 2e-05, "loss": 0.6281, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13436, "tokens_per_second_per_gpu": 10180.66, "total_tokens": 1326604656 }, { "epoch": 0.8400225056264066, "grad_norm": 0.9234962463378906, "learning_rate": 2e-05, "loss": 0.6166, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13437, "tokens_per_second_per_gpu": 9972.91, "total_tokens": 1326702264 }, { "epoch": 0.8400850212553138, "grad_norm": 0.875373363494873, "learning_rate": 2e-05, "loss": 0.6033, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13438, "tokens_per_second_per_gpu": 10412.45, "total_tokens": 1326802635 }, { "epoch": 0.8401475368842211, "grad_norm": 0.9254193902015686, "learning_rate": 2e-05, "loss": 0.6073, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13439, "tokens_per_second_per_gpu": 10250.61, "total_tokens": 1326896246 }, { "epoch": 0.8402100525131283, "grad_norm": 0.9084663391113281, "learning_rate": 2e-05, "loss": 0.618, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13440, "tokens_per_second_per_gpu": 10385.88, "total_tokens": 1326992590 }, { "epoch": 0.8402725681420355, "grad_norm": 0.9230947494506836, "learning_rate": 2e-05, "loss": 0.6484, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13441, "tokens_per_second_per_gpu": 10298.82, "total_tokens": 1327093361 }, { "epoch": 0.8403350837709427, "grad_norm": 0.8665649890899658, "learning_rate": 2e-05, "loss": 0.6187, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13442, "tokens_per_second_per_gpu": 10810.93, "total_tokens": 1327195656 }, { "epoch": 0.8403975993998499, "grad_norm": 0.8958004117012024, "learning_rate": 2e-05, "loss": 0.6292, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13443, "tokens_per_second_per_gpu": 11240.96, "total_tokens": 1327295273 }, { "epoch": 0.8404601150287572, "grad_norm": 0.9099698066711426, "learning_rate": 2e-05, "loss": 0.6788, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13444, "tokens_per_second_per_gpu": 10311.54, "total_tokens": 1327394911 }, { "epoch": 0.8405226306576644, "grad_norm": 0.8889120817184448, "learning_rate": 2e-05, "loss": 0.6129, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13445, "tokens_per_second_per_gpu": 10439.58, "total_tokens": 1327491882 }, { "epoch": 0.8405851462865717, "grad_norm": 0.8708623647689819, "learning_rate": 2e-05, "loss": 0.5726, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13446, "tokens_per_second_per_gpu": 10344.25, "total_tokens": 1327586776 }, { "epoch": 0.8406476619154789, "grad_norm": 0.8954371809959412, "learning_rate": 2e-05, "loss": 0.6111, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13447, "tokens_per_second_per_gpu": 9638.59, "total_tokens": 1327681286 }, { "epoch": 0.8407101775443862, "grad_norm": 0.8953920006752014, "learning_rate": 2e-05, "loss": 0.6186, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13448, "tokens_per_second_per_gpu": 9692.1, "total_tokens": 1327778199 }, { "epoch": 0.8407726931732933, "grad_norm": 0.8629385232925415, "learning_rate": 2e-05, "loss": 0.6123, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13449, "tokens_per_second_per_gpu": 11013.65, "total_tokens": 1327880506 }, { "epoch": 0.8408352088022005, "grad_norm": 0.8868728280067444, "learning_rate": 2e-05, "loss": 0.5919, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13450, "tokens_per_second_per_gpu": 9798.08, "total_tokens": 1327975945 }, { "epoch": 0.8408977244311078, "grad_norm": 0.8823872208595276, "learning_rate": 2e-05, "loss": 0.569, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13451, "tokens_per_second_per_gpu": 9910.6, "total_tokens": 1328071683 }, { "epoch": 0.840960240060015, "grad_norm": 0.9037289619445801, "learning_rate": 2e-05, "loss": 0.5881, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13452, "tokens_per_second_per_gpu": 9985.79, "total_tokens": 1328166250 }, { "epoch": 0.8410227556889223, "grad_norm": 0.9011433124542236, "learning_rate": 2e-05, "loss": 0.6429, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13453, "tokens_per_second_per_gpu": 10190.05, "total_tokens": 1328265162 }, { "epoch": 0.8410852713178295, "grad_norm": 0.9015330672264099, "learning_rate": 2e-05, "loss": 0.6284, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13454, "tokens_per_second_per_gpu": 10741.85, "total_tokens": 1328362012 }, { "epoch": 0.8411477869467366, "grad_norm": 0.8648087382316589, "learning_rate": 2e-05, "loss": 0.574, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13455, "tokens_per_second_per_gpu": 10622.44, "total_tokens": 1328457987 }, { "epoch": 0.8412103025756439, "grad_norm": 0.88298100233078, "learning_rate": 2e-05, "loss": 0.6227, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13456, "tokens_per_second_per_gpu": 10511.48, "total_tokens": 1328551638 }, { "epoch": 0.8412728182045511, "grad_norm": 0.8829026818275452, "learning_rate": 2e-05, "loss": 0.6026, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13457, "tokens_per_second_per_gpu": 10051.32, "total_tokens": 1328646293 }, { "epoch": 0.8413353338334584, "grad_norm": 0.9326271414756775, "learning_rate": 2e-05, "loss": 0.6319, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13458, "tokens_per_second_per_gpu": 10603.23, "total_tokens": 1328740197 }, { "epoch": 0.8413978494623656, "grad_norm": 0.902046799659729, "learning_rate": 2e-05, "loss": 0.6095, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13459, "tokens_per_second_per_gpu": 9779.93, "total_tokens": 1328832198 }, { "epoch": 0.8414603650912729, "grad_norm": 0.869389533996582, "learning_rate": 2e-05, "loss": 0.5699, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13460, "tokens_per_second_per_gpu": 10065.66, "total_tokens": 1328926417 }, { "epoch": 0.84152288072018, "grad_norm": 0.8852807283401489, "learning_rate": 2e-05, "loss": 0.6101, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13461, "tokens_per_second_per_gpu": 10112.69, "total_tokens": 1329026270 }, { "epoch": 0.8415853963490872, "grad_norm": 0.8952036499977112, "learning_rate": 2e-05, "loss": 0.6218, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13462, "tokens_per_second_per_gpu": 10641.45, "total_tokens": 1329123114 }, { "epoch": 0.8416479119779945, "grad_norm": 0.9193869829177856, "learning_rate": 2e-05, "loss": 0.5959, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13463, "tokens_per_second_per_gpu": 9853.88, "total_tokens": 1329218277 }, { "epoch": 0.8417104276069017, "grad_norm": 0.8931252956390381, "learning_rate": 2e-05, "loss": 0.5823, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13464, "tokens_per_second_per_gpu": 10380.32, "total_tokens": 1329314241 }, { "epoch": 0.841772943235809, "grad_norm": 0.8667293190956116, "learning_rate": 2e-05, "loss": 0.6081, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13465, "tokens_per_second_per_gpu": 10498.45, "total_tokens": 1329410948 }, { "epoch": 0.8418354588647162, "grad_norm": 0.8821680545806885, "learning_rate": 2e-05, "loss": 0.623, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13466, "tokens_per_second_per_gpu": 10384.32, "total_tokens": 1329510400 }, { "epoch": 0.8418979744936234, "grad_norm": 0.8732125163078308, "learning_rate": 2e-05, "loss": 0.6007, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13467, "tokens_per_second_per_gpu": 10710.2, "total_tokens": 1329607498 }, { "epoch": 0.8419604901225306, "grad_norm": 0.8939197659492493, "learning_rate": 2e-05, "loss": 0.6527, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13468, "tokens_per_second_per_gpu": 10041.29, "total_tokens": 1329705999 }, { "epoch": 0.8420230057514378, "grad_norm": 0.8829943537712097, "learning_rate": 2e-05, "loss": 0.5923, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13469, "tokens_per_second_per_gpu": 9835.56, "total_tokens": 1329804392 }, { "epoch": 0.8420855213803451, "grad_norm": 0.9129197001457214, "learning_rate": 2e-05, "loss": 0.6322, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13470, "tokens_per_second_per_gpu": 10162.13, "total_tokens": 1329902986 }, { "epoch": 0.8421480370092523, "grad_norm": 0.8835949897766113, "learning_rate": 2e-05, "loss": 0.5774, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13471, "tokens_per_second_per_gpu": 9939.29, "total_tokens": 1329999015 }, { "epoch": 0.8422105526381596, "grad_norm": 0.8777899146080017, "learning_rate": 2e-05, "loss": 0.6284, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13472, "tokens_per_second_per_gpu": 11011.76, "total_tokens": 1330098009 }, { "epoch": 0.8422730682670667, "grad_norm": 0.8923237323760986, "learning_rate": 2e-05, "loss": 0.6161, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13473, "tokens_per_second_per_gpu": 10220.44, "total_tokens": 1330189885 }, { "epoch": 0.842335583895974, "grad_norm": 0.8786696195602417, "learning_rate": 2e-05, "loss": 0.6065, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13474, "tokens_per_second_per_gpu": 10711.5, "total_tokens": 1330286439 }, { "epoch": 0.8423980995248812, "grad_norm": 0.9948433041572571, "learning_rate": 2e-05, "loss": 0.6283, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13475, "tokens_per_second_per_gpu": 10950.49, "total_tokens": 1330385345 }, { "epoch": 0.8424606151537885, "grad_norm": 0.9023364782333374, "learning_rate": 2e-05, "loss": 0.6117, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13476, "tokens_per_second_per_gpu": 9951.09, "total_tokens": 1330483882 }, { "epoch": 0.8425231307826957, "grad_norm": 0.8818504214286804, "learning_rate": 2e-05, "loss": 0.623, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13477, "tokens_per_second_per_gpu": 10501.21, "total_tokens": 1330583250 }, { "epoch": 0.8425856464116029, "grad_norm": 0.9438098669052124, "learning_rate": 2e-05, "loss": 0.6357, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13478, "tokens_per_second_per_gpu": 11037.82, "total_tokens": 1330684356 }, { "epoch": 0.8426481620405101, "grad_norm": 0.8796148300170898, "learning_rate": 2e-05, "loss": 0.6143, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13479, "tokens_per_second_per_gpu": 10702.77, "total_tokens": 1330780057 }, { "epoch": 0.8427106776694173, "grad_norm": 0.9305353760719299, "learning_rate": 2e-05, "loss": 0.5984, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13480, "tokens_per_second_per_gpu": 10065.71, "total_tokens": 1330876230 }, { "epoch": 0.8427731932983246, "grad_norm": 0.9026481509208679, "learning_rate": 2e-05, "loss": 0.6244, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13481, "tokens_per_second_per_gpu": 10408.88, "total_tokens": 1330975285 }, { "epoch": 0.8428357089272318, "grad_norm": 0.8857496976852417, "learning_rate": 2e-05, "loss": 0.6069, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13482, "tokens_per_second_per_gpu": 10460.06, "total_tokens": 1331073902 }, { "epoch": 0.842898224556139, "grad_norm": 0.9056264162063599, "learning_rate": 2e-05, "loss": 0.6615, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13483, "tokens_per_second_per_gpu": 10650.2, "total_tokens": 1331173524 }, { "epoch": 0.8429607401850463, "grad_norm": 0.9068964719772339, "learning_rate": 2e-05, "loss": 0.6234, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13484, "tokens_per_second_per_gpu": 10507.02, "total_tokens": 1331270355 }, { "epoch": 0.8430232558139535, "grad_norm": 0.892900288105011, "learning_rate": 2e-05, "loss": 0.6059, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13485, "tokens_per_second_per_gpu": 10520.6, "total_tokens": 1331369211 }, { "epoch": 0.8430857714428607, "grad_norm": 0.8791045546531677, "learning_rate": 2e-05, "loss": 0.6443, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13486, "tokens_per_second_per_gpu": 10703.61, "total_tokens": 1331470405 }, { "epoch": 0.8431482870717679, "grad_norm": 0.9202025532722473, "learning_rate": 2e-05, "loss": 0.6609, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13487, "tokens_per_second_per_gpu": 10712.74, "total_tokens": 1331569476 }, { "epoch": 0.8432108027006752, "grad_norm": 0.9022576212882996, "learning_rate": 2e-05, "loss": 0.6621, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13488, "tokens_per_second_per_gpu": 10294.21, "total_tokens": 1331667191 }, { "epoch": 0.8432733183295824, "grad_norm": 0.8749738931655884, "learning_rate": 2e-05, "loss": 0.6054, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13489, "tokens_per_second_per_gpu": 10487.84, "total_tokens": 1331764736 }, { "epoch": 0.8433358339584897, "grad_norm": 0.9427692890167236, "learning_rate": 2e-05, "loss": 0.6159, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13490, "tokens_per_second_per_gpu": 9196.27, "total_tokens": 1331856815 }, { "epoch": 0.8433983495873969, "grad_norm": 0.9028874635696411, "learning_rate": 2e-05, "loss": 0.6373, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13491, "tokens_per_second_per_gpu": 10230.99, "total_tokens": 1331953212 }, { "epoch": 0.843460865216304, "grad_norm": 0.935427188873291, "learning_rate": 2e-05, "loss": 0.6298, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13492, "tokens_per_second_per_gpu": 10749.94, "total_tokens": 1332051507 }, { "epoch": 0.8435233808452113, "grad_norm": 0.913995623588562, "learning_rate": 2e-05, "loss": 0.5636, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13493, "tokens_per_second_per_gpu": 9454.85, "total_tokens": 1332142676 }, { "epoch": 0.8435858964741185, "grad_norm": 0.9135915637016296, "learning_rate": 2e-05, "loss": 0.6321, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13494, "tokens_per_second_per_gpu": 10241.97, "total_tokens": 1332241390 }, { "epoch": 0.8436484121030258, "grad_norm": 0.9126525521278381, "learning_rate": 2e-05, "loss": 0.6017, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13495, "tokens_per_second_per_gpu": 10283.94, "total_tokens": 1332337678 }, { "epoch": 0.843710927731933, "grad_norm": 0.8775888085365295, "learning_rate": 2e-05, "loss": 0.5891, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13496, "tokens_per_second_per_gpu": 10053.1, "total_tokens": 1332434136 }, { "epoch": 0.8437734433608403, "grad_norm": 0.8817687630653381, "learning_rate": 2e-05, "loss": 0.6412, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13497, "tokens_per_second_per_gpu": 10565.04, "total_tokens": 1332533868 }, { "epoch": 0.8438359589897474, "grad_norm": 0.88362056016922, "learning_rate": 2e-05, "loss": 0.5958, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13498, "tokens_per_second_per_gpu": 11101.81, "total_tokens": 1332628803 }, { "epoch": 0.8438984746186546, "grad_norm": 0.9017900824546814, "learning_rate": 2e-05, "loss": 0.6407, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13499, "tokens_per_second_per_gpu": 10956.19, "total_tokens": 1332727851 }, { "epoch": 0.8439609902475619, "grad_norm": 0.8708425164222717, "learning_rate": 2e-05, "loss": 0.6027, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13500, "tokens_per_second_per_gpu": 10852.92, "total_tokens": 1332826551 }, { "epoch": 0.8440235058764691, "grad_norm": 0.9103457927703857, "learning_rate": 2e-05, "loss": 0.6073, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13501, "tokens_per_second_per_gpu": 9707.5, "total_tokens": 1332919051 }, { "epoch": 0.8440860215053764, "grad_norm": 0.8948062658309937, "learning_rate": 2e-05, "loss": 0.6045, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13502, "tokens_per_second_per_gpu": 10518.23, "total_tokens": 1333014175 }, { "epoch": 0.8441485371342836, "grad_norm": 0.9299184083938599, "learning_rate": 2e-05, "loss": 0.6524, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13503, "tokens_per_second_per_gpu": 9890.01, "total_tokens": 1333104513 }, { "epoch": 0.8442110527631908, "grad_norm": 0.9154597520828247, "learning_rate": 2e-05, "loss": 0.5913, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13504, "tokens_per_second_per_gpu": 10302.19, "total_tokens": 1333204059 }, { "epoch": 0.844273568392098, "grad_norm": 0.9240472912788391, "learning_rate": 2e-05, "loss": 0.5737, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13505, "tokens_per_second_per_gpu": 9824.57, "total_tokens": 1333298425 }, { "epoch": 0.8443360840210052, "grad_norm": 0.9508880376815796, "learning_rate": 2e-05, "loss": 0.6828, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13506, "tokens_per_second_per_gpu": 10428.35, "total_tokens": 1333396159 }, { "epoch": 0.8443985996499125, "grad_norm": 0.9372788071632385, "learning_rate": 2e-05, "loss": 0.6814, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13507, "tokens_per_second_per_gpu": 10952.06, "total_tokens": 1333499267 }, { "epoch": 0.8444611152788197, "grad_norm": 0.8778693079948425, "learning_rate": 2e-05, "loss": 0.6086, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13508, "tokens_per_second_per_gpu": 11160.53, "total_tokens": 1333597244 }, { "epoch": 0.844523630907727, "grad_norm": 0.903480589389801, "learning_rate": 2e-05, "loss": 0.6257, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13509, "tokens_per_second_per_gpu": 9631.07, "total_tokens": 1333695696 }, { "epoch": 0.8445861465366341, "grad_norm": 0.9585962295532227, "learning_rate": 2e-05, "loss": 0.5918, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13510, "tokens_per_second_per_gpu": 10754.96, "total_tokens": 1333792285 }, { "epoch": 0.8446486621655414, "grad_norm": 0.890640377998352, "learning_rate": 2e-05, "loss": 0.6051, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13511, "tokens_per_second_per_gpu": 10815.26, "total_tokens": 1333893898 }, { "epoch": 0.8447111777944486, "grad_norm": 0.9084611535072327, "learning_rate": 2e-05, "loss": 0.576, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13512, "tokens_per_second_per_gpu": 9898.03, "total_tokens": 1333987544 }, { "epoch": 0.8447736934233558, "grad_norm": 0.8527373671531677, "learning_rate": 2e-05, "loss": 0.6294, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13513, "tokens_per_second_per_gpu": 10632.18, "total_tokens": 1334089743 }, { "epoch": 0.8448362090522631, "grad_norm": 0.9119742512702942, "learning_rate": 2e-05, "loss": 0.6171, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13514, "tokens_per_second_per_gpu": 10413.08, "total_tokens": 1334188227 }, { "epoch": 0.8448987246811703, "grad_norm": 0.8751733899116516, "learning_rate": 2e-05, "loss": 0.577, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13515, "tokens_per_second_per_gpu": 10217.76, "total_tokens": 1334288484 }, { "epoch": 0.8449612403100775, "grad_norm": 0.877177357673645, "learning_rate": 2e-05, "loss": 0.6352, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13516, "tokens_per_second_per_gpu": 10642.95, "total_tokens": 1334390518 }, { "epoch": 0.8450237559389847, "grad_norm": 0.9229588508605957, "learning_rate": 2e-05, "loss": 0.6168, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13517, "tokens_per_second_per_gpu": 10863.38, "total_tokens": 1334487931 }, { "epoch": 0.845086271567892, "grad_norm": 0.8895165920257568, "learning_rate": 2e-05, "loss": 0.5946, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13518, "tokens_per_second_per_gpu": 10332.89, "total_tokens": 1334584445 }, { "epoch": 0.8451487871967992, "grad_norm": 0.8918160796165466, "learning_rate": 2e-05, "loss": 0.5764, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13519, "tokens_per_second_per_gpu": 9695.69, "total_tokens": 1334678842 }, { "epoch": 0.8452113028257064, "grad_norm": 0.885221540927887, "learning_rate": 2e-05, "loss": 0.6195, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13520, "tokens_per_second_per_gpu": 10461.39, "total_tokens": 1334777527 }, { "epoch": 0.8452738184546137, "grad_norm": 0.8708287477493286, "learning_rate": 2e-05, "loss": 0.6296, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13521, "tokens_per_second_per_gpu": 10817.29, "total_tokens": 1334880662 }, { "epoch": 0.8453363340835209, "grad_norm": 0.8884483575820923, "learning_rate": 2e-05, "loss": 0.6384, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13522, "tokens_per_second_per_gpu": 10733.21, "total_tokens": 1334982532 }, { "epoch": 0.8453988497124281, "grad_norm": 0.8795500993728638, "learning_rate": 2e-05, "loss": 0.6186, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13523, "tokens_per_second_per_gpu": 10702.98, "total_tokens": 1335079793 }, { "epoch": 0.8454613653413353, "grad_norm": 0.8913249969482422, "learning_rate": 2e-05, "loss": 0.6203, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13524, "tokens_per_second_per_gpu": 10392.21, "total_tokens": 1335178998 }, { "epoch": 0.8455238809702426, "grad_norm": 0.892024040222168, "learning_rate": 2e-05, "loss": 0.6421, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13525, "tokens_per_second_per_gpu": 10670.99, "total_tokens": 1335280389 }, { "epoch": 0.8455863965991498, "grad_norm": 0.8925288319587708, "learning_rate": 2e-05, "loss": 0.624, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13526, "tokens_per_second_per_gpu": 10028.13, "total_tokens": 1335380400 }, { "epoch": 0.845648912228057, "grad_norm": 0.8971315622329712, "learning_rate": 2e-05, "loss": 0.5871, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13527, "tokens_per_second_per_gpu": 9386.89, "total_tokens": 1335476289 }, { "epoch": 0.8457114278569643, "grad_norm": 0.8707805871963501, "learning_rate": 2e-05, "loss": 0.5763, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13528, "tokens_per_second_per_gpu": 10531.0, "total_tokens": 1335575925 }, { "epoch": 0.8457739434858714, "grad_norm": 0.9045200347900391, "learning_rate": 2e-05, "loss": 0.6122, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13529, "tokens_per_second_per_gpu": 10373.53, "total_tokens": 1335672283 }, { "epoch": 0.8458364591147787, "grad_norm": 0.8725362420082092, "learning_rate": 2e-05, "loss": 0.6087, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13530, "tokens_per_second_per_gpu": 10286.15, "total_tokens": 1335770137 }, { "epoch": 0.8458989747436859, "grad_norm": 0.8434605002403259, "learning_rate": 2e-05, "loss": 0.6269, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13531, "tokens_per_second_per_gpu": 10571.84, "total_tokens": 1335873458 }, { "epoch": 0.8459614903725932, "grad_norm": 0.8715245127677917, "learning_rate": 2e-05, "loss": 0.6191, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13532, "tokens_per_second_per_gpu": 10705.94, "total_tokens": 1335972987 }, { "epoch": 0.8460240060015004, "grad_norm": 0.8967203497886658, "learning_rate": 2e-05, "loss": 0.5877, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13533, "tokens_per_second_per_gpu": 9978.33, "total_tokens": 1336067802 }, { "epoch": 0.8460865216304077, "grad_norm": 0.8485281467437744, "learning_rate": 2e-05, "loss": 0.5902, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13534, "tokens_per_second_per_gpu": 11083.11, "total_tokens": 1336169393 }, { "epoch": 0.8461490372593148, "grad_norm": 0.8560484647750854, "learning_rate": 2e-05, "loss": 0.6125, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13535, "tokens_per_second_per_gpu": 10878.24, "total_tokens": 1336270643 }, { "epoch": 0.846211552888222, "grad_norm": 0.9100937247276306, "learning_rate": 2e-05, "loss": 0.6551, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13536, "tokens_per_second_per_gpu": 11357.0, "total_tokens": 1336369814 }, { "epoch": 0.8462740685171293, "grad_norm": 0.8929433226585388, "learning_rate": 2e-05, "loss": 0.6033, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13537, "tokens_per_second_per_gpu": 10944.69, "total_tokens": 1336468505 }, { "epoch": 0.8463365841460365, "grad_norm": 0.8601292371749878, "learning_rate": 2e-05, "loss": 0.592, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13538, "tokens_per_second_per_gpu": 10432.0, "total_tokens": 1336567266 }, { "epoch": 0.8463990997749438, "grad_norm": 0.8748618960380554, "learning_rate": 2e-05, "loss": 0.5891, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13539, "tokens_per_second_per_gpu": 9769.32, "total_tokens": 1336666171 }, { "epoch": 0.846461615403851, "grad_norm": 0.880632758140564, "learning_rate": 2e-05, "loss": 0.6405, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13540, "tokens_per_second_per_gpu": 9902.04, "total_tokens": 1336765108 }, { "epoch": 0.8465241310327581, "grad_norm": 0.8739454746246338, "learning_rate": 2e-05, "loss": 0.6063, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13541, "tokens_per_second_per_gpu": 10025.82, "total_tokens": 1336863115 }, { "epoch": 0.8465866466616654, "grad_norm": 0.8598803877830505, "learning_rate": 2e-05, "loss": 0.6189, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13542, "tokens_per_second_per_gpu": 10632.26, "total_tokens": 1336963703 }, { "epoch": 0.8466491622905726, "grad_norm": 0.8735783696174622, "learning_rate": 2e-05, "loss": 0.6058, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13543, "tokens_per_second_per_gpu": 10684.17, "total_tokens": 1337061280 }, { "epoch": 0.8467116779194799, "grad_norm": 0.9201939702033997, "learning_rate": 2e-05, "loss": 0.6462, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13544, "tokens_per_second_per_gpu": 11424.01, "total_tokens": 1337158602 }, { "epoch": 0.8467741935483871, "grad_norm": 0.8560792803764343, "learning_rate": 2e-05, "loss": 0.5891, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13545, "tokens_per_second_per_gpu": 10871.65, "total_tokens": 1337260394 }, { "epoch": 0.8468367091772944, "grad_norm": 0.8868287801742554, "learning_rate": 2e-05, "loss": 0.5901, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13546, "tokens_per_second_per_gpu": 11197.65, "total_tokens": 1337361379 }, { "epoch": 0.8468992248062015, "grad_norm": 0.8879423141479492, "learning_rate": 2e-05, "loss": 0.5934, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13547, "tokens_per_second_per_gpu": 10588.39, "total_tokens": 1337458832 }, { "epoch": 0.8469617404351087, "grad_norm": 0.8672919273376465, "learning_rate": 2e-05, "loss": 0.5968, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13548, "tokens_per_second_per_gpu": 10681.34, "total_tokens": 1337561307 }, { "epoch": 0.847024256064016, "grad_norm": 0.8564050197601318, "learning_rate": 2e-05, "loss": 0.5927, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13549, "tokens_per_second_per_gpu": 11429.54, "total_tokens": 1337661675 }, { "epoch": 0.8470867716929232, "grad_norm": 0.9154086709022522, "learning_rate": 2e-05, "loss": 0.6374, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13550, "tokens_per_second_per_gpu": 10311.6, "total_tokens": 1337759307 }, { "epoch": 0.8471492873218305, "grad_norm": 0.8623580932617188, "learning_rate": 2e-05, "loss": 0.6323, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13551, "tokens_per_second_per_gpu": 11133.6, "total_tokens": 1337861540 }, { "epoch": 0.8472118029507377, "grad_norm": 0.8911352157592773, "learning_rate": 2e-05, "loss": 0.5982, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13552, "tokens_per_second_per_gpu": 10478.58, "total_tokens": 1337958485 }, { "epoch": 0.8472743185796449, "grad_norm": 0.9295874238014221, "learning_rate": 2e-05, "loss": 0.6474, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13553, "tokens_per_second_per_gpu": 11426.2, "total_tokens": 1338062195 }, { "epoch": 0.8473368342085521, "grad_norm": 0.9158475399017334, "learning_rate": 2e-05, "loss": 0.6043, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13554, "tokens_per_second_per_gpu": 11081.19, "total_tokens": 1338158773 }, { "epoch": 0.8473993498374593, "grad_norm": 0.8867318630218506, "learning_rate": 2e-05, "loss": 0.6084, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13555, "tokens_per_second_per_gpu": 10787.04, "total_tokens": 1338255488 }, { "epoch": 0.8474618654663666, "grad_norm": 0.8507237434387207, "learning_rate": 2e-05, "loss": 0.6109, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13556, "tokens_per_second_per_gpu": 10990.93, "total_tokens": 1338356751 }, { "epoch": 0.8475243810952738, "grad_norm": 0.9061272144317627, "learning_rate": 2e-05, "loss": 0.6027, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13557, "tokens_per_second_per_gpu": 10758.54, "total_tokens": 1338453716 }, { "epoch": 0.8475868967241811, "grad_norm": 0.8890382051467896, "learning_rate": 2e-05, "loss": 0.6169, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13558, "tokens_per_second_per_gpu": 10303.21, "total_tokens": 1338551042 }, { "epoch": 0.8476494123530883, "grad_norm": 0.9120293855667114, "learning_rate": 2e-05, "loss": 0.6401, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13559, "tokens_per_second_per_gpu": 10132.2, "total_tokens": 1338649605 }, { "epoch": 0.8477119279819955, "grad_norm": 0.8965747356414795, "learning_rate": 2e-05, "loss": 0.6445, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13560, "tokens_per_second_per_gpu": 15714.83, "total_tokens": 1338751534 }, { "epoch": 0.8477744436109027, "grad_norm": 0.8846962451934814, "learning_rate": 2e-05, "loss": 0.6388, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13561, "tokens_per_second_per_gpu": 10820.62, "total_tokens": 1338852499 }, { "epoch": 0.84783695923981, "grad_norm": 0.9323542714118958, "learning_rate": 2e-05, "loss": 0.6428, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13562, "tokens_per_second_per_gpu": 10728.75, "total_tokens": 1338953306 }, { "epoch": 0.8478994748687172, "grad_norm": 0.9121949672698975, "learning_rate": 2e-05, "loss": 0.6629, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13563, "tokens_per_second_per_gpu": 10287.96, "total_tokens": 1339053994 }, { "epoch": 0.8479619904976244, "grad_norm": 0.900324285030365, "learning_rate": 2e-05, "loss": 0.613, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13564, "tokens_per_second_per_gpu": 10190.67, "total_tokens": 1339155202 }, { "epoch": 0.8480245061265317, "grad_norm": 0.8600577116012573, "learning_rate": 2e-05, "loss": 0.5733, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13565, "tokens_per_second_per_gpu": 10468.81, "total_tokens": 1339252705 }, { "epoch": 0.8480870217554388, "grad_norm": 0.8836600184440613, "learning_rate": 2e-05, "loss": 0.6755, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13566, "tokens_per_second_per_gpu": 11059.3, "total_tokens": 1339356068 }, { "epoch": 0.8481495373843461, "grad_norm": 0.9384081959724426, "learning_rate": 2e-05, "loss": 0.6196, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13567, "tokens_per_second_per_gpu": 9708.99, "total_tokens": 1339451122 }, { "epoch": 0.8482120530132533, "grad_norm": 0.8689131140708923, "learning_rate": 2e-05, "loss": 0.5792, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13568, "tokens_per_second_per_gpu": 10214.55, "total_tokens": 1339544659 }, { "epoch": 0.8482745686421606, "grad_norm": 0.8828728199005127, "learning_rate": 2e-05, "loss": 0.5962, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13569, "tokens_per_second_per_gpu": 9672.92, "total_tokens": 1339641398 }, { "epoch": 0.8483370842710678, "grad_norm": 0.9016878008842468, "learning_rate": 2e-05, "loss": 0.6078, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13570, "tokens_per_second_per_gpu": 10221.5, "total_tokens": 1339739806 }, { "epoch": 0.848399599899975, "grad_norm": 0.9123885631561279, "learning_rate": 2e-05, "loss": 0.6174, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13571, "tokens_per_second_per_gpu": 10416.59, "total_tokens": 1339842610 }, { "epoch": 0.8484621155288822, "grad_norm": 0.894901692867279, "learning_rate": 2e-05, "loss": 0.6294, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13572, "tokens_per_second_per_gpu": 10561.28, "total_tokens": 1339940968 }, { "epoch": 0.8485246311577894, "grad_norm": 0.9022228717803955, "learning_rate": 2e-05, "loss": 0.6049, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13573, "tokens_per_second_per_gpu": 10957.06, "total_tokens": 1340039373 }, { "epoch": 0.8485871467866967, "grad_norm": 0.9376405477523804, "learning_rate": 2e-05, "loss": 0.6293, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13574, "tokens_per_second_per_gpu": 10171.23, "total_tokens": 1340133061 }, { "epoch": 0.8486496624156039, "grad_norm": 0.8855889439582825, "learning_rate": 2e-05, "loss": 0.6006, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13575, "tokens_per_second_per_gpu": 10331.11, "total_tokens": 1340230007 }, { "epoch": 0.8487121780445112, "grad_norm": 0.8735007643699646, "learning_rate": 2e-05, "loss": 0.6212, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13576, "tokens_per_second_per_gpu": 10596.84, "total_tokens": 1340328482 }, { "epoch": 0.8487746936734184, "grad_norm": 0.8815125823020935, "learning_rate": 2e-05, "loss": 0.6041, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13577, "tokens_per_second_per_gpu": 10263.29, "total_tokens": 1340425620 }, { "epoch": 0.8488372093023255, "grad_norm": 0.9108851552009583, "learning_rate": 2e-05, "loss": 0.625, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13578, "tokens_per_second_per_gpu": 10069.66, "total_tokens": 1340521561 }, { "epoch": 0.8488997249312328, "grad_norm": 0.9066846370697021, "learning_rate": 2e-05, "loss": 0.6077, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13579, "tokens_per_second_per_gpu": 9280.11, "total_tokens": 1340615129 }, { "epoch": 0.84896224056014, "grad_norm": 0.9822290539741516, "learning_rate": 2e-05, "loss": 0.6353, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13580, "tokens_per_second_per_gpu": 9381.17, "total_tokens": 1340709205 }, { "epoch": 0.8490247561890473, "grad_norm": 0.8999797105789185, "learning_rate": 2e-05, "loss": 0.5976, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13581, "tokens_per_second_per_gpu": 9998.71, "total_tokens": 1340803137 }, { "epoch": 0.8490872718179545, "grad_norm": 0.912473738193512, "learning_rate": 2e-05, "loss": 0.6092, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13582, "tokens_per_second_per_gpu": 10441.43, "total_tokens": 1340903118 }, { "epoch": 0.8491497874468618, "grad_norm": 0.9776929020881653, "learning_rate": 2e-05, "loss": 0.632, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13583, "tokens_per_second_per_gpu": 10803.91, "total_tokens": 1341002438 }, { "epoch": 0.8492123030757689, "grad_norm": 0.9373757839202881, "learning_rate": 2e-05, "loss": 0.6442, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13584, "tokens_per_second_per_gpu": 10160.92, "total_tokens": 1341102718 }, { "epoch": 0.8492748187046761, "grad_norm": 0.8945867419242859, "learning_rate": 2e-05, "loss": 0.5687, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13585, "tokens_per_second_per_gpu": 9708.29, "total_tokens": 1341196301 }, { "epoch": 0.8493373343335834, "grad_norm": 0.8957825303077698, "learning_rate": 2e-05, "loss": 0.6085, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13586, "tokens_per_second_per_gpu": 10906.3, "total_tokens": 1341294234 }, { "epoch": 0.8493998499624906, "grad_norm": 0.8770196437835693, "learning_rate": 2e-05, "loss": 0.6143, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13587, "tokens_per_second_per_gpu": 11062.97, "total_tokens": 1341394860 }, { "epoch": 0.8494623655913979, "grad_norm": 0.9031738638877869, "learning_rate": 2e-05, "loss": 0.6181, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13588, "tokens_per_second_per_gpu": 10668.36, "total_tokens": 1341491808 }, { "epoch": 0.8495248812203051, "grad_norm": 0.8717968463897705, "learning_rate": 2e-05, "loss": 0.6138, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13589, "tokens_per_second_per_gpu": 10596.35, "total_tokens": 1341592227 }, { "epoch": 0.8495873968492123, "grad_norm": 0.8960423469543457, "learning_rate": 2e-05, "loss": 0.6565, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13590, "tokens_per_second_per_gpu": 10599.47, "total_tokens": 1341693675 }, { "epoch": 0.8496499124781195, "grad_norm": 0.8752831220626831, "learning_rate": 2e-05, "loss": 0.5945, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13591, "tokens_per_second_per_gpu": 10176.28, "total_tokens": 1341792163 }, { "epoch": 0.8497124281070267, "grad_norm": 0.8744347095489502, "learning_rate": 2e-05, "loss": 0.6029, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13592, "tokens_per_second_per_gpu": 10548.67, "total_tokens": 1341893357 }, { "epoch": 0.849774943735934, "grad_norm": 0.9029711484909058, "learning_rate": 2e-05, "loss": 0.5807, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13593, "tokens_per_second_per_gpu": 10099.67, "total_tokens": 1341989005 }, { "epoch": 0.8498374593648412, "grad_norm": 0.8780010938644409, "learning_rate": 2e-05, "loss": 0.5953, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13594, "tokens_per_second_per_gpu": 10191.89, "total_tokens": 1342083851 }, { "epoch": 0.8498999749937485, "grad_norm": 0.8807610273361206, "learning_rate": 2e-05, "loss": 0.6399, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13595, "tokens_per_second_per_gpu": 10473.5, "total_tokens": 1342182214 }, { "epoch": 0.8499624906226556, "grad_norm": 0.897663414478302, "learning_rate": 2e-05, "loss": 0.6007, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13596, "tokens_per_second_per_gpu": 9924.51, "total_tokens": 1342274425 }, { "epoch": 0.8500250062515629, "grad_norm": 0.8770773410797119, "learning_rate": 2e-05, "loss": 0.6144, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13597, "tokens_per_second_per_gpu": 10104.89, "total_tokens": 1342373714 }, { "epoch": 0.8500875218804701, "grad_norm": 0.9136184453964233, "learning_rate": 2e-05, "loss": 0.5971, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13598, "tokens_per_second_per_gpu": 9384.66, "total_tokens": 1342469031 }, { "epoch": 0.8501500375093773, "grad_norm": 0.8937616348266602, "learning_rate": 2e-05, "loss": 0.6537, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13599, "tokens_per_second_per_gpu": 10969.5, "total_tokens": 1342570112 }, { "epoch": 0.8502125531382846, "grad_norm": 0.8780198693275452, "learning_rate": 2e-05, "loss": 0.602, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13600, "tokens_per_second_per_gpu": 10221.58, "total_tokens": 1342667076 }, { "epoch": 0.8502750687671918, "grad_norm": 0.8741934895515442, "learning_rate": 2e-05, "loss": 0.6289, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13601, "tokens_per_second_per_gpu": 11035.98, "total_tokens": 1342767298 }, { "epoch": 0.8503375843960991, "grad_norm": 0.9018401503562927, "learning_rate": 2e-05, "loss": 0.5939, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13602, "tokens_per_second_per_gpu": 10665.09, "total_tokens": 1342862762 }, { "epoch": 0.8504001000250062, "grad_norm": 0.8380753993988037, "learning_rate": 2e-05, "loss": 0.6056, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13603, "tokens_per_second_per_gpu": 10895.8, "total_tokens": 1342963650 }, { "epoch": 0.8504626156539135, "grad_norm": 0.9040842652320862, "learning_rate": 2e-05, "loss": 0.6421, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13604, "tokens_per_second_per_gpu": 10409.71, "total_tokens": 1343062593 }, { "epoch": 0.8505251312828207, "grad_norm": 0.8979394435882568, "learning_rate": 2e-05, "loss": 0.6409, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13605, "tokens_per_second_per_gpu": 11413.13, "total_tokens": 1343163026 }, { "epoch": 0.850587646911728, "grad_norm": 0.8431503176689148, "learning_rate": 2e-05, "loss": 0.6048, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13606, "tokens_per_second_per_gpu": 11183.98, "total_tokens": 1343264789 }, { "epoch": 0.8506501625406352, "grad_norm": 0.8677700161933899, "learning_rate": 2e-05, "loss": 0.6575, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13607, "tokens_per_second_per_gpu": 11295.41, "total_tokens": 1343366722 }, { "epoch": 0.8507126781695424, "grad_norm": 0.8916361331939697, "learning_rate": 2e-05, "loss": 0.607, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13608, "tokens_per_second_per_gpu": 11171.89, "total_tokens": 1343470424 }, { "epoch": 0.8507751937984496, "grad_norm": 0.8892317414283752, "learning_rate": 2e-05, "loss": 0.5958, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13609, "tokens_per_second_per_gpu": 10622.15, "total_tokens": 1343572498 }, { "epoch": 0.8508377094273568, "grad_norm": 0.8951034545898438, "learning_rate": 2e-05, "loss": 0.5882, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13610, "tokens_per_second_per_gpu": 10376.23, "total_tokens": 1343669706 }, { "epoch": 0.8509002250562641, "grad_norm": 0.9154970645904541, "learning_rate": 2e-05, "loss": 0.582, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13611, "tokens_per_second_per_gpu": 10073.01, "total_tokens": 1343767670 }, { "epoch": 0.8509627406851713, "grad_norm": 0.8664509654045105, "learning_rate": 2e-05, "loss": 0.6021, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13612, "tokens_per_second_per_gpu": 10748.13, "total_tokens": 1343868948 }, { "epoch": 0.8510252563140785, "grad_norm": 0.8662587404251099, "learning_rate": 2e-05, "loss": 0.5851, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13613, "tokens_per_second_per_gpu": 10628.56, "total_tokens": 1343972199 }, { "epoch": 0.8510877719429858, "grad_norm": 0.8812136650085449, "learning_rate": 2e-05, "loss": 0.6083, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13614, "tokens_per_second_per_gpu": 10455.18, "total_tokens": 1344069506 }, { "epoch": 0.8511502875718929, "grad_norm": 0.9469527006149292, "learning_rate": 2e-05, "loss": 0.6177, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13615, "tokens_per_second_per_gpu": 10222.32, "total_tokens": 1344166351 }, { "epoch": 0.8512128032008002, "grad_norm": 0.8954537510871887, "learning_rate": 2e-05, "loss": 0.6005, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13616, "tokens_per_second_per_gpu": 10589.05, "total_tokens": 1344263791 }, { "epoch": 0.8512753188297074, "grad_norm": 0.8834907412528992, "learning_rate": 2e-05, "loss": 0.6085, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13617, "tokens_per_second_per_gpu": 10259.32, "total_tokens": 1344361850 }, { "epoch": 0.8513378344586147, "grad_norm": 0.9303449988365173, "learning_rate": 2e-05, "loss": 0.6398, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13618, "tokens_per_second_per_gpu": 10159.5, "total_tokens": 1344461589 }, { "epoch": 0.8514003500875219, "grad_norm": 0.8571428060531616, "learning_rate": 2e-05, "loss": 0.5811, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13619, "tokens_per_second_per_gpu": 9898.66, "total_tokens": 1344555621 }, { "epoch": 0.8514628657164292, "grad_norm": 0.8950944542884827, "learning_rate": 2e-05, "loss": 0.6127, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13620, "tokens_per_second_per_gpu": 10645.81, "total_tokens": 1344655445 }, { "epoch": 0.8515253813453363, "grad_norm": 0.9564082026481628, "learning_rate": 2e-05, "loss": 0.6261, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13621, "tokens_per_second_per_gpu": 9611.55, "total_tokens": 1344751792 }, { "epoch": 0.8515878969742435, "grad_norm": 0.8917484879493713, "learning_rate": 2e-05, "loss": 0.6015, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13622, "tokens_per_second_per_gpu": 10798.14, "total_tokens": 1344850139 }, { "epoch": 0.8516504126031508, "grad_norm": 0.913727879524231, "learning_rate": 2e-05, "loss": 0.6302, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13623, "tokens_per_second_per_gpu": 11130.52, "total_tokens": 1344951242 }, { "epoch": 0.851712928232058, "grad_norm": 0.9073680639266968, "learning_rate": 2e-05, "loss": 0.5855, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13624, "tokens_per_second_per_gpu": 10810.6, "total_tokens": 1345049766 }, { "epoch": 0.8517754438609653, "grad_norm": 0.9195286631584167, "learning_rate": 2e-05, "loss": 0.6392, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13625, "tokens_per_second_per_gpu": 10578.66, "total_tokens": 1345149814 }, { "epoch": 0.8518379594898725, "grad_norm": 0.9182800054550171, "learning_rate": 2e-05, "loss": 0.5969, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13626, "tokens_per_second_per_gpu": 10610.69, "total_tokens": 1345252869 }, { "epoch": 0.8519004751187796, "grad_norm": 0.9012035131454468, "learning_rate": 2e-05, "loss": 0.6227, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13627, "tokens_per_second_per_gpu": 9039.88, "total_tokens": 1345346857 }, { "epoch": 0.8519629907476869, "grad_norm": 0.877125084400177, "learning_rate": 2e-05, "loss": 0.6319, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13628, "tokens_per_second_per_gpu": 10908.29, "total_tokens": 1345446896 }, { "epoch": 0.8520255063765941, "grad_norm": 0.8540126085281372, "learning_rate": 2e-05, "loss": 0.6164, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13629, "tokens_per_second_per_gpu": 10789.48, "total_tokens": 1345549700 }, { "epoch": 0.8520880220055014, "grad_norm": 0.8777673244476318, "learning_rate": 2e-05, "loss": 0.5741, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13630, "tokens_per_second_per_gpu": 10124.29, "total_tokens": 1345648772 }, { "epoch": 0.8521505376344086, "grad_norm": 0.8967000246047974, "learning_rate": 2e-05, "loss": 0.5978, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13631, "tokens_per_second_per_gpu": 10575.54, "total_tokens": 1345743751 }, { "epoch": 0.8522130532633159, "grad_norm": 0.8767430186271667, "learning_rate": 2e-05, "loss": 0.6345, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13632, "tokens_per_second_per_gpu": 10993.43, "total_tokens": 1345845742 }, { "epoch": 0.852275568892223, "grad_norm": 0.9296412467956543, "learning_rate": 2e-05, "loss": 0.6354, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13633, "tokens_per_second_per_gpu": 10149.03, "total_tokens": 1345939306 }, { "epoch": 0.8523380845211302, "grad_norm": 0.9575696587562561, "learning_rate": 2e-05, "loss": 0.5952, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13634, "tokens_per_second_per_gpu": 10348.84, "total_tokens": 1346032728 }, { "epoch": 0.8524006001500375, "grad_norm": 0.8984970450401306, "learning_rate": 2e-05, "loss": 0.5584, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13635, "tokens_per_second_per_gpu": 10260.68, "total_tokens": 1346127187 }, { "epoch": 0.8524631157789447, "grad_norm": 0.9601854085922241, "learning_rate": 2e-05, "loss": 0.6143, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13636, "tokens_per_second_per_gpu": 10492.45, "total_tokens": 1346228971 }, { "epoch": 0.852525631407852, "grad_norm": 0.9303641319274902, "learning_rate": 2e-05, "loss": 0.636, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13637, "tokens_per_second_per_gpu": 10898.49, "total_tokens": 1346327820 }, { "epoch": 0.8525881470367592, "grad_norm": 0.9279441237449646, "learning_rate": 2e-05, "loss": 0.6577, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13638, "tokens_per_second_per_gpu": 10669.74, "total_tokens": 1346428269 }, { "epoch": 0.8526506626656665, "grad_norm": 0.8940821886062622, "learning_rate": 2e-05, "loss": 0.5939, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13639, "tokens_per_second_per_gpu": 10893.67, "total_tokens": 1346528541 }, { "epoch": 0.8527131782945736, "grad_norm": 0.8731138706207275, "learning_rate": 2e-05, "loss": 0.6272, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13640, "tokens_per_second_per_gpu": 10678.11, "total_tokens": 1346632658 }, { "epoch": 0.8527756939234808, "grad_norm": 0.8914170265197754, "learning_rate": 2e-05, "loss": 0.5599, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13641, "tokens_per_second_per_gpu": 9408.02, "total_tokens": 1346721485 }, { "epoch": 0.8528382095523881, "grad_norm": 0.8943724036216736, "learning_rate": 2e-05, "loss": 0.6151, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13642, "tokens_per_second_per_gpu": 10835.42, "total_tokens": 1346822639 }, { "epoch": 0.8529007251812953, "grad_norm": 0.927193820476532, "learning_rate": 2e-05, "loss": 0.6571, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13643, "tokens_per_second_per_gpu": 10564.81, "total_tokens": 1346921496 }, { "epoch": 0.8529632408102026, "grad_norm": 0.9053352475166321, "learning_rate": 2e-05, "loss": 0.6418, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13644, "tokens_per_second_per_gpu": 11062.84, "total_tokens": 1347022865 }, { "epoch": 0.8530257564391098, "grad_norm": 0.8724948167800903, "learning_rate": 2e-05, "loss": 0.6242, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13645, "tokens_per_second_per_gpu": 11047.27, "total_tokens": 1347125526 }, { "epoch": 0.853088272068017, "grad_norm": 0.8789075613021851, "learning_rate": 2e-05, "loss": 0.6292, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13646, "tokens_per_second_per_gpu": 10998.77, "total_tokens": 1347226581 }, { "epoch": 0.8531507876969242, "grad_norm": 0.9295552968978882, "learning_rate": 2e-05, "loss": 0.6243, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13647, "tokens_per_second_per_gpu": 9857.91, "total_tokens": 1347325714 }, { "epoch": 0.8532133033258315, "grad_norm": 0.9038299322128296, "learning_rate": 2e-05, "loss": 0.6512, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13648, "tokens_per_second_per_gpu": 10849.26, "total_tokens": 1347431751 }, { "epoch": 0.8532758189547387, "grad_norm": 0.857503354549408, "learning_rate": 2e-05, "loss": 0.6006, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13649, "tokens_per_second_per_gpu": 10608.51, "total_tokens": 1347531475 }, { "epoch": 0.8533383345836459, "grad_norm": 0.8940430283546448, "learning_rate": 2e-05, "loss": 0.6129, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13650, "tokens_per_second_per_gpu": 10075.59, "total_tokens": 1347626328 }, { "epoch": 0.8534008502125532, "grad_norm": 0.9021949172019958, "learning_rate": 2e-05, "loss": 0.5922, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13651, "tokens_per_second_per_gpu": 10126.96, "total_tokens": 1347725673 }, { "epoch": 0.8534633658414603, "grad_norm": 0.9243302941322327, "learning_rate": 2e-05, "loss": 0.6264, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13652, "tokens_per_second_per_gpu": 10325.04, "total_tokens": 1347826017 }, { "epoch": 0.8535258814703676, "grad_norm": 0.9009075164794922, "learning_rate": 2e-05, "loss": 0.6169, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13653, "tokens_per_second_per_gpu": 11545.22, "total_tokens": 1347926517 }, { "epoch": 0.8535883970992748, "grad_norm": 0.9058691263198853, "learning_rate": 2e-05, "loss": 0.6146, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13654, "tokens_per_second_per_gpu": 10457.37, "total_tokens": 1348026218 }, { "epoch": 0.853650912728182, "grad_norm": 0.928436279296875, "learning_rate": 2e-05, "loss": 0.6456, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13655, "tokens_per_second_per_gpu": 10826.18, "total_tokens": 1348122919 }, { "epoch": 0.8537134283570893, "grad_norm": 0.8823308944702148, "learning_rate": 2e-05, "loss": 0.6249, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13656, "tokens_per_second_per_gpu": 10289.02, "total_tokens": 1348221848 }, { "epoch": 0.8537759439859965, "grad_norm": 0.8625016212463379, "learning_rate": 2e-05, "loss": 0.5783, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13657, "tokens_per_second_per_gpu": 10945.83, "total_tokens": 1348319068 }, { "epoch": 0.8538384596149037, "grad_norm": 0.8799179792404175, "learning_rate": 2e-05, "loss": 0.5909, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13658, "tokens_per_second_per_gpu": 9874.95, "total_tokens": 1348418684 }, { "epoch": 0.8539009752438109, "grad_norm": 0.9165480136871338, "learning_rate": 2e-05, "loss": 0.6202, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13659, "tokens_per_second_per_gpu": 10183.36, "total_tokens": 1348513979 }, { "epoch": 0.8539634908727182, "grad_norm": 0.857418954372406, "learning_rate": 2e-05, "loss": 0.618, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13660, "tokens_per_second_per_gpu": 10428.03, "total_tokens": 1348616010 }, { "epoch": 0.8540260065016254, "grad_norm": 0.8704392313957214, "learning_rate": 2e-05, "loss": 0.6229, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13661, "tokens_per_second_per_gpu": 10964.0, "total_tokens": 1348719646 }, { "epoch": 0.8540885221305327, "grad_norm": 0.8929861783981323, "learning_rate": 2e-05, "loss": 0.5865, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13662, "tokens_per_second_per_gpu": 10932.83, "total_tokens": 1348818489 }, { "epoch": 0.8541510377594399, "grad_norm": 0.8467332720756531, "learning_rate": 2e-05, "loss": 0.5899, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13663, "tokens_per_second_per_gpu": 10628.81, "total_tokens": 1348919149 }, { "epoch": 0.854213553388347, "grad_norm": 0.9236463904380798, "learning_rate": 2e-05, "loss": 0.6183, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13664, "tokens_per_second_per_gpu": 10811.59, "total_tokens": 1349020358 }, { "epoch": 0.8542760690172543, "grad_norm": 0.9406234622001648, "learning_rate": 2e-05, "loss": 0.6685, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13665, "tokens_per_second_per_gpu": 10971.67, "total_tokens": 1349122082 }, { "epoch": 0.8543385846461615, "grad_norm": 0.8515204191207886, "learning_rate": 2e-05, "loss": 0.5976, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13666, "tokens_per_second_per_gpu": 10666.23, "total_tokens": 1349223245 }, { "epoch": 0.8544011002750688, "grad_norm": 0.9144315123558044, "learning_rate": 2e-05, "loss": 0.6317, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13667, "tokens_per_second_per_gpu": 9085.78, "total_tokens": 1349317300 }, { "epoch": 0.854463615903976, "grad_norm": 0.916345477104187, "learning_rate": 2e-05, "loss": 0.6139, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13668, "tokens_per_second_per_gpu": 10180.56, "total_tokens": 1349412044 }, { "epoch": 0.8545261315328833, "grad_norm": 0.9006975889205933, "learning_rate": 2e-05, "loss": 0.6258, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13669, "tokens_per_second_per_gpu": 10728.52, "total_tokens": 1349514447 }, { "epoch": 0.8545886471617904, "grad_norm": 0.8790720105171204, "learning_rate": 2e-05, "loss": 0.5986, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13670, "tokens_per_second_per_gpu": 10480.27, "total_tokens": 1349614398 }, { "epoch": 0.8546511627906976, "grad_norm": 0.8794605731964111, "learning_rate": 2e-05, "loss": 0.5874, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13671, "tokens_per_second_per_gpu": 10064.4, "total_tokens": 1349710225 }, { "epoch": 0.8547136784196049, "grad_norm": 0.8774428963661194, "learning_rate": 2e-05, "loss": 0.594, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13672, "tokens_per_second_per_gpu": 10296.29, "total_tokens": 1349810591 }, { "epoch": 0.8547761940485121, "grad_norm": 0.8967084884643555, "learning_rate": 2e-05, "loss": 0.6628, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13673, "tokens_per_second_per_gpu": 10958.16, "total_tokens": 1349911646 }, { "epoch": 0.8548387096774194, "grad_norm": 0.8619900941848755, "learning_rate": 2e-05, "loss": 0.554, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13674, "tokens_per_second_per_gpu": 10308.23, "total_tokens": 1350007491 }, { "epoch": 0.8549012253063266, "grad_norm": 0.8922132849693298, "learning_rate": 2e-05, "loss": 0.5781, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13675, "tokens_per_second_per_gpu": 9867.75, "total_tokens": 1350101738 }, { "epoch": 0.8549637409352339, "grad_norm": 0.872266411781311, "learning_rate": 2e-05, "loss": 0.6032, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13676, "tokens_per_second_per_gpu": 10463.73, "total_tokens": 1350199655 }, { "epoch": 0.855026256564141, "grad_norm": 0.9711304903030396, "learning_rate": 2e-05, "loss": 0.6242, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13677, "tokens_per_second_per_gpu": 10969.24, "total_tokens": 1350300689 }, { "epoch": 0.8550887721930482, "grad_norm": 0.9226827025413513, "learning_rate": 2e-05, "loss": 0.626, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13678, "tokens_per_second_per_gpu": 10251.55, "total_tokens": 1350394871 }, { "epoch": 0.8551512878219555, "grad_norm": 0.8900325298309326, "learning_rate": 2e-05, "loss": 0.594, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13679, "tokens_per_second_per_gpu": 10356.62, "total_tokens": 1350492184 }, { "epoch": 0.8552138034508627, "grad_norm": 0.9004863500595093, "learning_rate": 2e-05, "loss": 0.637, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13680, "tokens_per_second_per_gpu": 10588.66, "total_tokens": 1350592685 }, { "epoch": 0.85527631907977, "grad_norm": 0.9041595458984375, "learning_rate": 2e-05, "loss": 0.6225, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13681, "tokens_per_second_per_gpu": 11178.4, "total_tokens": 1350688896 }, { "epoch": 0.8553388347086772, "grad_norm": 0.8767504096031189, "learning_rate": 2e-05, "loss": 0.6076, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13682, "tokens_per_second_per_gpu": 10500.25, "total_tokens": 1350786827 }, { "epoch": 0.8554013503375844, "grad_norm": 0.9146304130554199, "learning_rate": 2e-05, "loss": 0.617, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13683, "tokens_per_second_per_gpu": 9975.02, "total_tokens": 1350882609 }, { "epoch": 0.8554638659664916, "grad_norm": 0.8727186918258667, "learning_rate": 2e-05, "loss": 0.5971, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13684, "tokens_per_second_per_gpu": 10644.62, "total_tokens": 1350981554 }, { "epoch": 0.8555263815953988, "grad_norm": 0.8578295707702637, "learning_rate": 2e-05, "loss": 0.6322, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13685, "tokens_per_second_per_gpu": 11079.34, "total_tokens": 1351085658 }, { "epoch": 0.8555888972243061, "grad_norm": 0.8853726983070374, "learning_rate": 2e-05, "loss": 0.595, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13686, "tokens_per_second_per_gpu": 9956.18, "total_tokens": 1351182782 }, { "epoch": 0.8556514128532133, "grad_norm": 0.86551433801651, "learning_rate": 2e-05, "loss": 0.6198, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13687, "tokens_per_second_per_gpu": 10952.81, "total_tokens": 1351282958 }, { "epoch": 0.8557139284821206, "grad_norm": 0.8449913859367371, "learning_rate": 2e-05, "loss": 0.5923, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13688, "tokens_per_second_per_gpu": 10552.0, "total_tokens": 1351382217 }, { "epoch": 0.8557764441110277, "grad_norm": 0.8567749261856079, "learning_rate": 2e-05, "loss": 0.5766, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13689, "tokens_per_second_per_gpu": 10622.45, "total_tokens": 1351481883 }, { "epoch": 0.855838959739935, "grad_norm": 0.8648229837417603, "learning_rate": 2e-05, "loss": 0.6169, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13690, "tokens_per_second_per_gpu": 10639.71, "total_tokens": 1351583838 }, { "epoch": 0.8559014753688422, "grad_norm": 0.8894426226615906, "learning_rate": 2e-05, "loss": 0.6143, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13691, "tokens_per_second_per_gpu": 11046.38, "total_tokens": 1351683544 }, { "epoch": 0.8559639909977494, "grad_norm": 0.9408916234970093, "learning_rate": 2e-05, "loss": 0.6417, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13692, "tokens_per_second_per_gpu": 9564.05, "total_tokens": 1351775898 }, { "epoch": 0.8560265066266567, "grad_norm": 0.8856968879699707, "learning_rate": 2e-05, "loss": 0.634, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13693, "tokens_per_second_per_gpu": 10180.73, "total_tokens": 1351875578 }, { "epoch": 0.8560890222555639, "grad_norm": 0.8975003361701965, "learning_rate": 2e-05, "loss": 0.6543, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13694, "tokens_per_second_per_gpu": 11087.75, "total_tokens": 1351981915 }, { "epoch": 0.8561515378844711, "grad_norm": 0.9400160312652588, "learning_rate": 2e-05, "loss": 0.6139, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13695, "tokens_per_second_per_gpu": 10281.15, "total_tokens": 1352078765 }, { "epoch": 0.8562140535133783, "grad_norm": 0.8846915364265442, "learning_rate": 2e-05, "loss": 0.5877, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13696, "tokens_per_second_per_gpu": 10548.16, "total_tokens": 1352177357 }, { "epoch": 0.8562765691422856, "grad_norm": 0.8643429279327393, "learning_rate": 2e-05, "loss": 0.6088, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13697, "tokens_per_second_per_gpu": 10221.63, "total_tokens": 1352276879 }, { "epoch": 0.8563390847711928, "grad_norm": 0.8656125068664551, "learning_rate": 2e-05, "loss": 0.6328, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13698, "tokens_per_second_per_gpu": 10425.01, "total_tokens": 1352378847 }, { "epoch": 0.8564016004001, "grad_norm": 0.9237455725669861, "learning_rate": 2e-05, "loss": 0.632, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13699, "tokens_per_second_per_gpu": 10426.33, "total_tokens": 1352478383 }, { "epoch": 0.8564641160290073, "grad_norm": 0.8759704232215881, "learning_rate": 2e-05, "loss": 0.6216, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13700, "tokens_per_second_per_gpu": 10490.36, "total_tokens": 1352577824 }, { "epoch": 0.8565266316579144, "grad_norm": 0.8782541155815125, "learning_rate": 2e-05, "loss": 0.6386, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13701, "tokens_per_second_per_gpu": 10607.85, "total_tokens": 1352680360 }, { "epoch": 0.8565891472868217, "grad_norm": 0.8721879720687866, "learning_rate": 2e-05, "loss": 0.6171, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13702, "tokens_per_second_per_gpu": 10560.32, "total_tokens": 1352778924 }, { "epoch": 0.8566516629157289, "grad_norm": 0.8716470003128052, "learning_rate": 2e-05, "loss": 0.6234, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13703, "tokens_per_second_per_gpu": 10826.64, "total_tokens": 1352880167 }, { "epoch": 0.8567141785446362, "grad_norm": 0.8672653436660767, "learning_rate": 2e-05, "loss": 0.5699, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13704, "tokens_per_second_per_gpu": 10248.29, "total_tokens": 1352976662 }, { "epoch": 0.8567766941735434, "grad_norm": 0.9101560115814209, "learning_rate": 2e-05, "loss": 0.587, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13705, "tokens_per_second_per_gpu": 10789.32, "total_tokens": 1353072289 }, { "epoch": 0.8568392098024507, "grad_norm": 0.8606186509132385, "learning_rate": 2e-05, "loss": 0.5925, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13706, "tokens_per_second_per_gpu": 10762.39, "total_tokens": 1353173823 }, { "epoch": 0.8569017254313578, "grad_norm": 0.8592342138290405, "learning_rate": 2e-05, "loss": 0.6096, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13707, "tokens_per_second_per_gpu": 10881.42, "total_tokens": 1353274706 }, { "epoch": 0.856964241060265, "grad_norm": 0.8954876065254211, "learning_rate": 2e-05, "loss": 0.6487, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13708, "tokens_per_second_per_gpu": 10548.14, "total_tokens": 1353377507 }, { "epoch": 0.8570267566891723, "grad_norm": 0.9679699540138245, "learning_rate": 2e-05, "loss": 0.5902, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13709, "tokens_per_second_per_gpu": 10736.23, "total_tokens": 1353477418 }, { "epoch": 0.8570892723180795, "grad_norm": 0.9255354404449463, "learning_rate": 2e-05, "loss": 0.6217, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13710, "tokens_per_second_per_gpu": 10124.19, "total_tokens": 1353576824 }, { "epoch": 0.8571517879469868, "grad_norm": 0.9000644087791443, "learning_rate": 2e-05, "loss": 0.6117, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13711, "tokens_per_second_per_gpu": 10666.89, "total_tokens": 1353676097 }, { "epoch": 0.857214303575894, "grad_norm": 0.94758540391922, "learning_rate": 2e-05, "loss": 0.6197, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13712, "tokens_per_second_per_gpu": 10544.97, "total_tokens": 1353771970 }, { "epoch": 0.8572768192048013, "grad_norm": 0.9326270222663879, "learning_rate": 2e-05, "loss": 0.5844, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13713, "tokens_per_second_per_gpu": 10226.7, "total_tokens": 1353869877 }, { "epoch": 0.8573393348337084, "grad_norm": 0.8632721304893494, "learning_rate": 2e-05, "loss": 0.5926, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13714, "tokens_per_second_per_gpu": 10404.89, "total_tokens": 1353970108 }, { "epoch": 0.8574018504626156, "grad_norm": 0.8533925414085388, "learning_rate": 2e-05, "loss": 0.5983, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13715, "tokens_per_second_per_gpu": 10621.24, "total_tokens": 1354070497 }, { "epoch": 0.8574643660915229, "grad_norm": 0.9250907301902771, "learning_rate": 2e-05, "loss": 0.5809, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13716, "tokens_per_second_per_gpu": 9445.32, "total_tokens": 1354163571 }, { "epoch": 0.8575268817204301, "grad_norm": 0.8846310973167419, "learning_rate": 2e-05, "loss": 0.6358, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13717, "tokens_per_second_per_gpu": 10990.59, "total_tokens": 1354265040 }, { "epoch": 0.8575893973493374, "grad_norm": 0.9023575782775879, "learning_rate": 2e-05, "loss": 0.6497, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13718, "tokens_per_second_per_gpu": 10746.19, "total_tokens": 1354365665 }, { "epoch": 0.8576519129782446, "grad_norm": 0.853491485118866, "learning_rate": 2e-05, "loss": 0.6262, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13719, "tokens_per_second_per_gpu": 10183.06, "total_tokens": 1354463896 }, { "epoch": 0.8577144286071517, "grad_norm": 0.8741627335548401, "learning_rate": 2e-05, "loss": 0.5748, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13720, "tokens_per_second_per_gpu": 9728.54, "total_tokens": 1354559525 }, { "epoch": 0.857776944236059, "grad_norm": 0.9060618877410889, "learning_rate": 2e-05, "loss": 0.6561, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13721, "tokens_per_second_per_gpu": 10488.82, "total_tokens": 1354659912 }, { "epoch": 0.8578394598649662, "grad_norm": 0.9419626593589783, "learning_rate": 2e-05, "loss": 0.607, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13722, "tokens_per_second_per_gpu": 10560.68, "total_tokens": 1354756211 }, { "epoch": 0.8579019754938735, "grad_norm": 0.8715477585792542, "learning_rate": 2e-05, "loss": 0.6248, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13723, "tokens_per_second_per_gpu": 10294.97, "total_tokens": 1354858876 }, { "epoch": 0.8579644911227807, "grad_norm": 0.8795633316040039, "learning_rate": 2e-05, "loss": 0.6482, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13724, "tokens_per_second_per_gpu": 10915.73, "total_tokens": 1354961632 }, { "epoch": 0.858027006751688, "grad_norm": 0.8594920635223389, "learning_rate": 2e-05, "loss": 0.6391, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13725, "tokens_per_second_per_gpu": 10724.99, "total_tokens": 1355065448 }, { "epoch": 0.8580895223805951, "grad_norm": 0.9083865880966187, "learning_rate": 2e-05, "loss": 0.6107, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13726, "tokens_per_second_per_gpu": 10399.69, "total_tokens": 1355166395 }, { "epoch": 0.8581520380095023, "grad_norm": 0.9466552734375, "learning_rate": 2e-05, "loss": 0.6687, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13727, "tokens_per_second_per_gpu": 10346.86, "total_tokens": 1355264669 }, { "epoch": 0.8582145536384096, "grad_norm": 0.9203512668609619, "learning_rate": 2e-05, "loss": 0.6635, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13728, "tokens_per_second_per_gpu": 10798.29, "total_tokens": 1355366201 }, { "epoch": 0.8582770692673168, "grad_norm": 0.9607911705970764, "learning_rate": 2e-05, "loss": 0.5862, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13729, "tokens_per_second_per_gpu": 9620.69, "total_tokens": 1355458627 }, { "epoch": 0.8583395848962241, "grad_norm": 0.8750358819961548, "learning_rate": 2e-05, "loss": 0.6295, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13730, "tokens_per_second_per_gpu": 11049.81, "total_tokens": 1355558779 }, { "epoch": 0.8584021005251313, "grad_norm": 0.9321355819702148, "learning_rate": 2e-05, "loss": 0.6202, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13731, "tokens_per_second_per_gpu": 10399.88, "total_tokens": 1355656089 }, { "epoch": 0.8584646161540385, "grad_norm": 0.9032673835754395, "learning_rate": 2e-05, "loss": 0.6196, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13732, "tokens_per_second_per_gpu": 10334.58, "total_tokens": 1355751310 }, { "epoch": 0.8585271317829457, "grad_norm": 0.8825342059135437, "learning_rate": 2e-05, "loss": 0.5943, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13733, "tokens_per_second_per_gpu": 10231.47, "total_tokens": 1355847590 }, { "epoch": 0.858589647411853, "grad_norm": 0.9545532464981079, "learning_rate": 2e-05, "loss": 0.7033, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13734, "tokens_per_second_per_gpu": 10141.33, "total_tokens": 1355947293 }, { "epoch": 0.8586521630407602, "grad_norm": 0.8935046195983887, "learning_rate": 2e-05, "loss": 0.6163, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13735, "tokens_per_second_per_gpu": 10787.92, "total_tokens": 1356049601 }, { "epoch": 0.8587146786696674, "grad_norm": 0.8817103505134583, "learning_rate": 2e-05, "loss": 0.6219, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13736, "tokens_per_second_per_gpu": 11063.27, "total_tokens": 1356149851 }, { "epoch": 0.8587771942985747, "grad_norm": 0.9271260499954224, "learning_rate": 2e-05, "loss": 0.6019, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13737, "tokens_per_second_per_gpu": 9864.0, "total_tokens": 1356245141 }, { "epoch": 0.8588397099274818, "grad_norm": 0.8698348999023438, "learning_rate": 2e-05, "loss": 0.5776, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13738, "tokens_per_second_per_gpu": 10523.14, "total_tokens": 1356344324 }, { "epoch": 0.8589022255563891, "grad_norm": 0.8813225030899048, "learning_rate": 2e-05, "loss": 0.6335, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13739, "tokens_per_second_per_gpu": 10616.07, "total_tokens": 1356443804 }, { "epoch": 0.8589647411852963, "grad_norm": 0.9098532795906067, "learning_rate": 2e-05, "loss": 0.6476, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13740, "tokens_per_second_per_gpu": 11209.96, "total_tokens": 1356545163 }, { "epoch": 0.8590272568142036, "grad_norm": 0.8852219581604004, "learning_rate": 2e-05, "loss": 0.6157, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13741, "tokens_per_second_per_gpu": 10820.03, "total_tokens": 1356646657 }, { "epoch": 0.8590897724431108, "grad_norm": 0.9161791801452637, "learning_rate": 2e-05, "loss": 0.6105, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13742, "tokens_per_second_per_gpu": 10937.38, "total_tokens": 1356749327 }, { "epoch": 0.859152288072018, "grad_norm": 1.0029914379119873, "learning_rate": 2e-05, "loss": 0.6296, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13743, "tokens_per_second_per_gpu": 10849.17, "total_tokens": 1356853003 }, { "epoch": 0.8592148037009252, "grad_norm": 0.9118077158927917, "learning_rate": 2e-05, "loss": 0.6182, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13744, "tokens_per_second_per_gpu": 9911.42, "total_tokens": 1356949736 }, { "epoch": 0.8592773193298324, "grad_norm": 0.9080346822738647, "learning_rate": 2e-05, "loss": 0.6388, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13745, "tokens_per_second_per_gpu": 11112.72, "total_tokens": 1357050582 }, { "epoch": 0.8593398349587397, "grad_norm": 0.9767568707466125, "learning_rate": 2e-05, "loss": 0.6054, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13746, "tokens_per_second_per_gpu": 10666.23, "total_tokens": 1357144202 }, { "epoch": 0.8594023505876469, "grad_norm": 0.8505994081497192, "learning_rate": 2e-05, "loss": 0.6019, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13747, "tokens_per_second_per_gpu": 11313.29, "total_tokens": 1357249749 }, { "epoch": 0.8594648662165542, "grad_norm": 0.9427158236503601, "learning_rate": 2e-05, "loss": 0.6559, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13748, "tokens_per_second_per_gpu": 10958.65, "total_tokens": 1357353853 }, { "epoch": 0.8595273818454614, "grad_norm": 0.8861249089241028, "learning_rate": 2e-05, "loss": 0.62, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13749, "tokens_per_second_per_gpu": 10523.81, "total_tokens": 1357453080 }, { "epoch": 0.8595898974743686, "grad_norm": 0.8645991683006287, "learning_rate": 2e-05, "loss": 0.639, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13750, "tokens_per_second_per_gpu": 11244.87, "total_tokens": 1357558587 }, { "epoch": 0.8596524131032758, "grad_norm": 0.9339787364006042, "learning_rate": 2e-05, "loss": 0.612, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13751, "tokens_per_second_per_gpu": 10189.45, "total_tokens": 1357655949 }, { "epoch": 0.859714928732183, "grad_norm": 0.908244252204895, "learning_rate": 2e-05, "loss": 0.638, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13752, "tokens_per_second_per_gpu": 10116.3, "total_tokens": 1357752340 }, { "epoch": 0.8597774443610903, "grad_norm": 0.8717268109321594, "learning_rate": 2e-05, "loss": 0.6031, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13753, "tokens_per_second_per_gpu": 10465.41, "total_tokens": 1357852255 }, { "epoch": 0.8598399599899975, "grad_norm": 0.8711352348327637, "learning_rate": 2e-05, "loss": 0.5758, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13754, "tokens_per_second_per_gpu": 9931.78, "total_tokens": 1357947710 }, { "epoch": 0.8599024756189048, "grad_norm": 0.9179366230964661, "learning_rate": 2e-05, "loss": 0.6597, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13755, "tokens_per_second_per_gpu": 10784.81, "total_tokens": 1358048765 }, { "epoch": 0.859964991247812, "grad_norm": 0.8889208436012268, "learning_rate": 2e-05, "loss": 0.592, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13756, "tokens_per_second_per_gpu": 10430.09, "total_tokens": 1358146294 }, { "epoch": 0.8600275068767191, "grad_norm": 0.9141576886177063, "learning_rate": 2e-05, "loss": 0.5952, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13757, "tokens_per_second_per_gpu": 10649.26, "total_tokens": 1358244936 }, { "epoch": 0.8600900225056264, "grad_norm": 0.9094994068145752, "learning_rate": 2e-05, "loss": 0.5893, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13758, "tokens_per_second_per_gpu": 9890.04, "total_tokens": 1358338841 }, { "epoch": 0.8601525381345336, "grad_norm": 0.9610269069671631, "learning_rate": 2e-05, "loss": 0.6278, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13759, "tokens_per_second_per_gpu": 11075.35, "total_tokens": 1358440333 }, { "epoch": 0.8602150537634409, "grad_norm": 0.8776191473007202, "learning_rate": 2e-05, "loss": 0.6151, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13760, "tokens_per_second_per_gpu": 10680.89, "total_tokens": 1358541307 }, { "epoch": 0.8602775693923481, "grad_norm": 0.8936066627502441, "learning_rate": 2e-05, "loss": 0.5976, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13761, "tokens_per_second_per_gpu": 11669.55, "total_tokens": 1358637321 }, { "epoch": 0.8603400850212554, "grad_norm": 0.8992181420326233, "learning_rate": 2e-05, "loss": 0.6476, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13762, "tokens_per_second_per_gpu": 10237.36, "total_tokens": 1358736013 }, { "epoch": 0.8604026006501625, "grad_norm": 0.8999672532081604, "learning_rate": 2e-05, "loss": 0.6266, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13763, "tokens_per_second_per_gpu": 10827.82, "total_tokens": 1358837340 }, { "epoch": 0.8604651162790697, "grad_norm": 0.9046410918235779, "learning_rate": 2e-05, "loss": 0.6434, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13764, "tokens_per_second_per_gpu": 11127.47, "total_tokens": 1358935718 }, { "epoch": 0.860527631907977, "grad_norm": 0.8996307253837585, "learning_rate": 2e-05, "loss": 0.5518, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13765, "tokens_per_second_per_gpu": 10085.5, "total_tokens": 1359029027 }, { "epoch": 0.8605901475368842, "grad_norm": 0.8638820052146912, "learning_rate": 2e-05, "loss": 0.625, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13766, "tokens_per_second_per_gpu": 10643.35, "total_tokens": 1359130179 }, { "epoch": 0.8606526631657915, "grad_norm": 0.8691705465316772, "learning_rate": 2e-05, "loss": 0.5813, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13767, "tokens_per_second_per_gpu": 10853.74, "total_tokens": 1359230116 }, { "epoch": 0.8607151787946987, "grad_norm": 0.8754107356071472, "learning_rate": 2e-05, "loss": 0.5907, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13768, "tokens_per_second_per_gpu": 10454.59, "total_tokens": 1359328375 }, { "epoch": 0.8607776944236059, "grad_norm": 0.8851208686828613, "learning_rate": 2e-05, "loss": 0.6625, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13769, "tokens_per_second_per_gpu": 11008.01, "total_tokens": 1359433408 }, { "epoch": 0.8608402100525131, "grad_norm": 0.9180008769035339, "learning_rate": 2e-05, "loss": 0.627, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13770, "tokens_per_second_per_gpu": 10622.57, "total_tokens": 1359530642 }, { "epoch": 0.8609027256814203, "grad_norm": 0.8728248476982117, "learning_rate": 2e-05, "loss": 0.583, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13771, "tokens_per_second_per_gpu": 9902.11, "total_tokens": 1359628913 }, { "epoch": 0.8609652413103276, "grad_norm": 0.9081562161445618, "learning_rate": 2e-05, "loss": 0.6178, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13772, "tokens_per_second_per_gpu": 10514.85, "total_tokens": 1359727935 }, { "epoch": 0.8610277569392348, "grad_norm": 0.8828142285346985, "learning_rate": 2e-05, "loss": 0.5835, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13773, "tokens_per_second_per_gpu": 10411.55, "total_tokens": 1359824489 }, { "epoch": 0.8610902725681421, "grad_norm": 0.9070652723312378, "learning_rate": 2e-05, "loss": 0.6343, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13774, "tokens_per_second_per_gpu": 10424.57, "total_tokens": 1359921931 }, { "epoch": 0.8611527881970492, "grad_norm": 0.9303500652313232, "learning_rate": 2e-05, "loss": 0.6522, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13775, "tokens_per_second_per_gpu": 10932.87, "total_tokens": 1360021909 }, { "epoch": 0.8612153038259565, "grad_norm": 0.8760564923286438, "learning_rate": 2e-05, "loss": 0.5949, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13776, "tokens_per_second_per_gpu": 10418.36, "total_tokens": 1360119539 }, { "epoch": 0.8612778194548637, "grad_norm": 0.8731273412704468, "learning_rate": 2e-05, "loss": 0.5817, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13777, "tokens_per_second_per_gpu": 10602.83, "total_tokens": 1360218592 }, { "epoch": 0.861340335083771, "grad_norm": 0.8722694516181946, "learning_rate": 2e-05, "loss": 0.6125, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13778, "tokens_per_second_per_gpu": 11094.58, "total_tokens": 1360320974 }, { "epoch": 0.8614028507126782, "grad_norm": 0.8683849573135376, "learning_rate": 2e-05, "loss": 0.6239, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13779, "tokens_per_second_per_gpu": 11062.3, "total_tokens": 1360422569 }, { "epoch": 0.8614653663415854, "grad_norm": 0.9144240617752075, "learning_rate": 2e-05, "loss": 0.6044, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13780, "tokens_per_second_per_gpu": 9171.33, "total_tokens": 1360516155 }, { "epoch": 0.8615278819704926, "grad_norm": 0.8962490558624268, "learning_rate": 2e-05, "loss": 0.6192, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13781, "tokens_per_second_per_gpu": 10414.92, "total_tokens": 1360615518 }, { "epoch": 0.8615903975993998, "grad_norm": 0.8919053673744202, "learning_rate": 2e-05, "loss": 0.669, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13782, "tokens_per_second_per_gpu": 10631.51, "total_tokens": 1360713961 }, { "epoch": 0.8616529132283071, "grad_norm": 0.8599897623062134, "learning_rate": 2e-05, "loss": 0.6147, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13783, "tokens_per_second_per_gpu": 11097.08, "total_tokens": 1360815206 }, { "epoch": 0.8617154288572143, "grad_norm": 0.8930612802505493, "learning_rate": 2e-05, "loss": 0.6464, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13784, "tokens_per_second_per_gpu": 10231.73, "total_tokens": 1360917529 }, { "epoch": 0.8617779444861215, "grad_norm": 0.8643513321876526, "learning_rate": 2e-05, "loss": 0.5745, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13785, "tokens_per_second_per_gpu": 10703.44, "total_tokens": 1361018432 }, { "epoch": 0.8618404601150288, "grad_norm": 0.8895429968833923, "learning_rate": 2e-05, "loss": 0.5951, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13786, "tokens_per_second_per_gpu": 10268.82, "total_tokens": 1361114836 }, { "epoch": 0.8619029757439359, "grad_norm": 0.9008859992027283, "learning_rate": 2e-05, "loss": 0.6652, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13787, "tokens_per_second_per_gpu": 10658.2, "total_tokens": 1361216438 }, { "epoch": 0.8619654913728432, "grad_norm": 0.8994156122207642, "learning_rate": 2e-05, "loss": 0.6279, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13788, "tokens_per_second_per_gpu": 10088.83, "total_tokens": 1361317108 }, { "epoch": 0.8620280070017504, "grad_norm": 0.9103279709815979, "learning_rate": 2e-05, "loss": 0.6408, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13789, "tokens_per_second_per_gpu": 10869.52, "total_tokens": 1361416479 }, { "epoch": 0.8620905226306577, "grad_norm": 0.9012070894241333, "learning_rate": 2e-05, "loss": 0.6565, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13790, "tokens_per_second_per_gpu": 10879.15, "total_tokens": 1361516793 }, { "epoch": 0.8621530382595649, "grad_norm": 0.8550344109535217, "learning_rate": 2e-05, "loss": 0.5962, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13791, "tokens_per_second_per_gpu": 10487.46, "total_tokens": 1361617542 }, { "epoch": 0.8622155538884722, "grad_norm": 0.8480883240699768, "learning_rate": 2e-05, "loss": 0.6278, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13792, "tokens_per_second_per_gpu": 10676.0, "total_tokens": 1361720021 }, { "epoch": 0.8622780695173794, "grad_norm": 0.8607614040374756, "learning_rate": 2e-05, "loss": 0.632, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13793, "tokens_per_second_per_gpu": 11309.07, "total_tokens": 1361823454 }, { "epoch": 0.8623405851462865, "grad_norm": 0.8592767119407654, "learning_rate": 2e-05, "loss": 0.6219, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13794, "tokens_per_second_per_gpu": 10433.8, "total_tokens": 1361924050 }, { "epoch": 0.8624031007751938, "grad_norm": 0.9213865399360657, "learning_rate": 2e-05, "loss": 0.6539, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13795, "tokens_per_second_per_gpu": 11101.97, "total_tokens": 1362029187 }, { "epoch": 0.862465616404101, "grad_norm": 0.9276717305183411, "learning_rate": 2e-05, "loss": 0.6164, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13796, "tokens_per_second_per_gpu": 10212.44, "total_tokens": 1362126812 }, { "epoch": 0.8625281320330083, "grad_norm": 0.8847355842590332, "learning_rate": 2e-05, "loss": 0.5978, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13797, "tokens_per_second_per_gpu": 10088.36, "total_tokens": 1362224565 }, { "epoch": 0.8625906476619155, "grad_norm": 0.879684567451477, "learning_rate": 2e-05, "loss": 0.6199, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13798, "tokens_per_second_per_gpu": 14518.66, "total_tokens": 1362323223 }, { "epoch": 0.8626531632908228, "grad_norm": 0.8976579308509827, "learning_rate": 2e-05, "loss": 0.5973, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13799, "tokens_per_second_per_gpu": 9973.84, "total_tokens": 1362419662 }, { "epoch": 0.8627156789197299, "grad_norm": 0.9044094681739807, "learning_rate": 2e-05, "loss": 0.6155, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13800, "tokens_per_second_per_gpu": 10122.25, "total_tokens": 1362517188 }, { "epoch": 0.8627781945486371, "grad_norm": 0.9156257510185242, "learning_rate": 2e-05, "loss": 0.6174, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13801, "tokens_per_second_per_gpu": 10572.33, "total_tokens": 1362616149 }, { "epoch": 0.8628407101775444, "grad_norm": 0.8801537752151489, "learning_rate": 2e-05, "loss": 0.6195, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13802, "tokens_per_second_per_gpu": 9967.97, "total_tokens": 1362715760 }, { "epoch": 0.8629032258064516, "grad_norm": 0.862353503704071, "learning_rate": 2e-05, "loss": 0.6113, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13803, "tokens_per_second_per_gpu": 11037.91, "total_tokens": 1362817339 }, { "epoch": 0.8629657414353589, "grad_norm": 0.8937323689460754, "learning_rate": 2e-05, "loss": 0.6298, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13804, "tokens_per_second_per_gpu": 10751.49, "total_tokens": 1362917759 }, { "epoch": 0.8630282570642661, "grad_norm": 0.8642858266830444, "learning_rate": 2e-05, "loss": 0.6127, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13805, "tokens_per_second_per_gpu": 10463.32, "total_tokens": 1363022799 }, { "epoch": 0.8630907726931732, "grad_norm": 0.8758110404014587, "learning_rate": 2e-05, "loss": 0.6225, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13806, "tokens_per_second_per_gpu": 10681.99, "total_tokens": 1363125392 }, { "epoch": 0.8631532883220805, "grad_norm": 0.8511325716972351, "learning_rate": 2e-05, "loss": 0.6117, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13807, "tokens_per_second_per_gpu": 10947.09, "total_tokens": 1363226906 }, { "epoch": 0.8632158039509877, "grad_norm": 0.8632297515869141, "learning_rate": 2e-05, "loss": 0.6266, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13808, "tokens_per_second_per_gpu": 11094.35, "total_tokens": 1363325945 }, { "epoch": 0.863278319579895, "grad_norm": 0.9012274742126465, "learning_rate": 2e-05, "loss": 0.6176, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13809, "tokens_per_second_per_gpu": 10522.96, "total_tokens": 1363427406 }, { "epoch": 0.8633408352088022, "grad_norm": 0.8938800692558289, "learning_rate": 2e-05, "loss": 0.5736, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13810, "tokens_per_second_per_gpu": 9850.11, "total_tokens": 1363521552 }, { "epoch": 0.8634033508377095, "grad_norm": 0.9219911694526672, "learning_rate": 2e-05, "loss": 0.6433, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13811, "tokens_per_second_per_gpu": 10597.33, "total_tokens": 1363623234 }, { "epoch": 0.8634658664666166, "grad_norm": 0.8668802976608276, "learning_rate": 2e-05, "loss": 0.6055, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13812, "tokens_per_second_per_gpu": 9593.32, "total_tokens": 1363722032 }, { "epoch": 0.8635283820955238, "grad_norm": 0.8953104019165039, "learning_rate": 2e-05, "loss": 0.6188, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13813, "tokens_per_second_per_gpu": 11156.42, "total_tokens": 1363826159 }, { "epoch": 0.8635908977244311, "grad_norm": 0.8911019563674927, "learning_rate": 2e-05, "loss": 0.658, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13814, "tokens_per_second_per_gpu": 10934.12, "total_tokens": 1363928604 }, { "epoch": 0.8636534133533383, "grad_norm": 0.8886027336120605, "learning_rate": 2e-05, "loss": 0.5812, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13815, "tokens_per_second_per_gpu": 10573.42, "total_tokens": 1364028542 }, { "epoch": 0.8637159289822456, "grad_norm": 0.887822151184082, "learning_rate": 2e-05, "loss": 0.5972, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13816, "tokens_per_second_per_gpu": 10384.71, "total_tokens": 1364127376 }, { "epoch": 0.8637784446111528, "grad_norm": 0.8714884519577026, "learning_rate": 2e-05, "loss": 0.6238, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13817, "tokens_per_second_per_gpu": 10656.25, "total_tokens": 1364230860 }, { "epoch": 0.86384096024006, "grad_norm": 0.8882405161857605, "learning_rate": 2e-05, "loss": 0.618, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13818, "tokens_per_second_per_gpu": 10637.32, "total_tokens": 1364331541 }, { "epoch": 0.8639034758689672, "grad_norm": 0.8876622915267944, "learning_rate": 2e-05, "loss": 0.6073, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13819, "tokens_per_second_per_gpu": 9998.1, "total_tokens": 1364429289 }, { "epoch": 0.8639659914978745, "grad_norm": 0.8902413845062256, "learning_rate": 2e-05, "loss": 0.604, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13820, "tokens_per_second_per_gpu": 10652.27, "total_tokens": 1364527666 }, { "epoch": 0.8640285071267817, "grad_norm": 0.8821817636489868, "learning_rate": 2e-05, "loss": 0.613, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13821, "tokens_per_second_per_gpu": 10904.74, "total_tokens": 1364628896 }, { "epoch": 0.8640910227556889, "grad_norm": 0.8999561667442322, "learning_rate": 2e-05, "loss": 0.6277, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13822, "tokens_per_second_per_gpu": 10216.23, "total_tokens": 1364726222 }, { "epoch": 0.8641535383845962, "grad_norm": 0.8931401371955872, "learning_rate": 2e-05, "loss": 0.5865, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13823, "tokens_per_second_per_gpu": 10410.78, "total_tokens": 1364825698 }, { "epoch": 0.8642160540135033, "grad_norm": 0.961073637008667, "learning_rate": 2e-05, "loss": 0.6459, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13824, "tokens_per_second_per_gpu": 9580.47, "total_tokens": 1364918917 }, { "epoch": 0.8642785696424106, "grad_norm": 0.8757419586181641, "learning_rate": 2e-05, "loss": 0.6316, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13825, "tokens_per_second_per_gpu": 10606.03, "total_tokens": 1365018135 }, { "epoch": 0.8643410852713178, "grad_norm": 0.8605726957321167, "learning_rate": 2e-05, "loss": 0.593, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13826, "tokens_per_second_per_gpu": 10387.97, "total_tokens": 1365118192 }, { "epoch": 0.864403600900225, "grad_norm": 0.8775597810745239, "learning_rate": 2e-05, "loss": 0.6222, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13827, "tokens_per_second_per_gpu": 10494.39, "total_tokens": 1365219444 }, { "epoch": 0.8644661165291323, "grad_norm": 0.8911173343658447, "learning_rate": 2e-05, "loss": 0.5904, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13828, "tokens_per_second_per_gpu": 10413.1, "total_tokens": 1365318545 }, { "epoch": 0.8645286321580395, "grad_norm": 0.9118210077285767, "learning_rate": 2e-05, "loss": 0.6251, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13829, "tokens_per_second_per_gpu": 10515.66, "total_tokens": 1365418246 }, { "epoch": 0.8645911477869468, "grad_norm": 0.8751228451728821, "learning_rate": 2e-05, "loss": 0.5985, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13830, "tokens_per_second_per_gpu": 10993.37, "total_tokens": 1365518979 }, { "epoch": 0.8646536634158539, "grad_norm": 0.8920634388923645, "learning_rate": 2e-05, "loss": 0.616, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13831, "tokens_per_second_per_gpu": 10487.75, "total_tokens": 1365616704 }, { "epoch": 0.8647161790447612, "grad_norm": 0.8894487023353577, "learning_rate": 2e-05, "loss": 0.6175, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13832, "tokens_per_second_per_gpu": 10829.1, "total_tokens": 1365717077 }, { "epoch": 0.8647786946736684, "grad_norm": 0.8797053098678589, "learning_rate": 2e-05, "loss": 0.6358, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13833, "tokens_per_second_per_gpu": 11016.97, "total_tokens": 1365819625 }, { "epoch": 0.8648412103025757, "grad_norm": 0.8864859342575073, "learning_rate": 2e-05, "loss": 0.6267, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13834, "tokens_per_second_per_gpu": 10930.93, "total_tokens": 1365922607 }, { "epoch": 0.8649037259314829, "grad_norm": 0.9043905735015869, "learning_rate": 2e-05, "loss": 0.5954, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13835, "tokens_per_second_per_gpu": 10014.07, "total_tokens": 1366018951 }, { "epoch": 0.8649662415603901, "grad_norm": 0.915305495262146, "learning_rate": 2e-05, "loss": 0.6239, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13836, "tokens_per_second_per_gpu": 10751.03, "total_tokens": 1366121288 }, { "epoch": 0.8650287571892973, "grad_norm": 0.8892737030982971, "learning_rate": 2e-05, "loss": 0.6056, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13837, "tokens_per_second_per_gpu": 10055.73, "total_tokens": 1366220913 }, { "epoch": 0.8650912728182045, "grad_norm": 0.872706949710846, "learning_rate": 2e-05, "loss": 0.588, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13838, "tokens_per_second_per_gpu": 10478.78, "total_tokens": 1366317725 }, { "epoch": 0.8651537884471118, "grad_norm": 0.9023489952087402, "learning_rate": 2e-05, "loss": 0.6081, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13839, "tokens_per_second_per_gpu": 10640.88, "total_tokens": 1366415641 }, { "epoch": 0.865216304076019, "grad_norm": 0.9098963141441345, "learning_rate": 2e-05, "loss": 0.6391, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13840, "tokens_per_second_per_gpu": 10141.06, "total_tokens": 1366512718 }, { "epoch": 0.8652788197049263, "grad_norm": 0.9063819646835327, "learning_rate": 2e-05, "loss": 0.6172, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13841, "tokens_per_second_per_gpu": 10257.88, "total_tokens": 1366609986 }, { "epoch": 0.8653413353338335, "grad_norm": 0.8874498009681702, "learning_rate": 2e-05, "loss": 0.6347, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13842, "tokens_per_second_per_gpu": 10428.04, "total_tokens": 1366710876 }, { "epoch": 0.8654038509627406, "grad_norm": 0.9070665836334229, "learning_rate": 2e-05, "loss": 0.5875, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13843, "tokens_per_second_per_gpu": 9952.81, "total_tokens": 1366805386 }, { "epoch": 0.8654663665916479, "grad_norm": 0.8819438219070435, "learning_rate": 2e-05, "loss": 0.6297, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13844, "tokens_per_second_per_gpu": 11044.05, "total_tokens": 1366909044 }, { "epoch": 0.8655288822205551, "grad_norm": 0.8961368799209595, "learning_rate": 2e-05, "loss": 0.6558, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13845, "tokens_per_second_per_gpu": 10004.99, "total_tokens": 1367009230 }, { "epoch": 0.8655913978494624, "grad_norm": 0.8997392654418945, "learning_rate": 2e-05, "loss": 0.639, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13846, "tokens_per_second_per_gpu": 10873.17, "total_tokens": 1367110278 }, { "epoch": 0.8656539134783696, "grad_norm": 0.9098605513572693, "learning_rate": 2e-05, "loss": 0.5585, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13847, "tokens_per_second_per_gpu": 9814.35, "total_tokens": 1367204353 }, { "epoch": 0.8657164291072769, "grad_norm": 0.8748741745948792, "learning_rate": 2e-05, "loss": 0.6149, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13848, "tokens_per_second_per_gpu": 10878.22, "total_tokens": 1367305900 }, { "epoch": 0.865778944736184, "grad_norm": 0.8950906991958618, "learning_rate": 2e-05, "loss": 0.6032, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13849, "tokens_per_second_per_gpu": 10318.04, "total_tokens": 1367405566 }, { "epoch": 0.8658414603650912, "grad_norm": 0.9206117987632751, "learning_rate": 2e-05, "loss": 0.6469, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13850, "tokens_per_second_per_gpu": 9608.63, "total_tokens": 1367504297 }, { "epoch": 0.8659039759939985, "grad_norm": 0.9011545777320862, "learning_rate": 2e-05, "loss": 0.6447, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13851, "tokens_per_second_per_gpu": 10821.42, "total_tokens": 1367607311 }, { "epoch": 0.8659664916229057, "grad_norm": 0.8416211009025574, "learning_rate": 2e-05, "loss": 0.5653, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13852, "tokens_per_second_per_gpu": 10638.69, "total_tokens": 1367706864 }, { "epoch": 0.866029007251813, "grad_norm": 0.8873695731163025, "learning_rate": 2e-05, "loss": 0.6184, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13853, "tokens_per_second_per_gpu": 10923.97, "total_tokens": 1367809214 }, { "epoch": 0.8660915228807202, "grad_norm": 0.8790813088417053, "learning_rate": 2e-05, "loss": 0.6436, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13854, "tokens_per_second_per_gpu": 10998.17, "total_tokens": 1367909267 }, { "epoch": 0.8661540385096274, "grad_norm": 0.8873053193092346, "learning_rate": 2e-05, "loss": 0.6082, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13855, "tokens_per_second_per_gpu": 10272.63, "total_tokens": 1368007391 }, { "epoch": 0.8662165541385346, "grad_norm": 0.8780272603034973, "learning_rate": 2e-05, "loss": 0.6211, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13856, "tokens_per_second_per_gpu": 10652.48, "total_tokens": 1368105802 }, { "epoch": 0.8662790697674418, "grad_norm": 0.9046192169189453, "learning_rate": 2e-05, "loss": 0.6519, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13857, "tokens_per_second_per_gpu": 10628.48, "total_tokens": 1368201194 }, { "epoch": 0.8663415853963491, "grad_norm": 0.9178924560546875, "learning_rate": 2e-05, "loss": 0.6567, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13858, "tokens_per_second_per_gpu": 10757.24, "total_tokens": 1368301019 }, { "epoch": 0.8664041010252563, "grad_norm": 0.8724814653396606, "learning_rate": 2e-05, "loss": 0.5926, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13859, "tokens_per_second_per_gpu": 10825.92, "total_tokens": 1368398582 }, { "epoch": 0.8664666166541636, "grad_norm": 0.8720682859420776, "learning_rate": 2e-05, "loss": 0.5822, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13860, "tokens_per_second_per_gpu": 10094.6, "total_tokens": 1368493809 }, { "epoch": 0.8665291322830707, "grad_norm": 0.906460165977478, "learning_rate": 2e-05, "loss": 0.6337, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13861, "tokens_per_second_per_gpu": 10776.45, "total_tokens": 1368592664 }, { "epoch": 0.866591647911978, "grad_norm": 0.8664929866790771, "learning_rate": 2e-05, "loss": 0.5783, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13862, "tokens_per_second_per_gpu": 9359.91, "total_tokens": 1368688657 }, { "epoch": 0.8666541635408852, "grad_norm": 0.8631518483161926, "learning_rate": 2e-05, "loss": 0.5793, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13863, "tokens_per_second_per_gpu": 9680.27, "total_tokens": 1368783692 }, { "epoch": 0.8667166791697924, "grad_norm": 0.9384978413581848, "learning_rate": 2e-05, "loss": 0.5996, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13864, "tokens_per_second_per_gpu": 10261.79, "total_tokens": 1368880964 }, { "epoch": 0.8667791947986997, "grad_norm": 0.8883556723594666, "learning_rate": 2e-05, "loss": 0.6142, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13865, "tokens_per_second_per_gpu": 10556.79, "total_tokens": 1368981680 }, { "epoch": 0.8668417104276069, "grad_norm": 0.886012613773346, "learning_rate": 2e-05, "loss": 0.6205, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13866, "tokens_per_second_per_gpu": 10364.53, "total_tokens": 1369080214 }, { "epoch": 0.8669042260565142, "grad_norm": 0.8722330331802368, "learning_rate": 2e-05, "loss": 0.6362, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13867, "tokens_per_second_per_gpu": 10092.72, "total_tokens": 1369179127 }, { "epoch": 0.8669667416854213, "grad_norm": 0.8736047148704529, "learning_rate": 2e-05, "loss": 0.5995, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13868, "tokens_per_second_per_gpu": 10656.52, "total_tokens": 1369275969 }, { "epoch": 0.8670292573143286, "grad_norm": 0.876225471496582, "learning_rate": 2e-05, "loss": 0.6028, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13869, "tokens_per_second_per_gpu": 10267.7, "total_tokens": 1369376517 }, { "epoch": 0.8670917729432358, "grad_norm": 0.8946043252944946, "learning_rate": 2e-05, "loss": 0.5976, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13870, "tokens_per_second_per_gpu": 9763.9, "total_tokens": 1369472406 }, { "epoch": 0.867154288572143, "grad_norm": 0.867607057094574, "learning_rate": 2e-05, "loss": 0.603, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13871, "tokens_per_second_per_gpu": 10579.93, "total_tokens": 1369571730 }, { "epoch": 0.8672168042010503, "grad_norm": 0.9094706177711487, "learning_rate": 2e-05, "loss": 0.6083, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13872, "tokens_per_second_per_gpu": 9534.44, "total_tokens": 1369669069 }, { "epoch": 0.8672793198299575, "grad_norm": 0.9093189835548401, "learning_rate": 2e-05, "loss": 0.568, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13873, "tokens_per_second_per_gpu": 11147.5, "total_tokens": 1369765281 }, { "epoch": 0.8673418354588647, "grad_norm": 0.8844821453094482, "learning_rate": 2e-05, "loss": 0.6437, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13874, "tokens_per_second_per_gpu": 11112.05, "total_tokens": 1369869237 }, { "epoch": 0.8674043510877719, "grad_norm": 0.8958286046981812, "learning_rate": 2e-05, "loss": 0.5839, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13875, "tokens_per_second_per_gpu": 10246.19, "total_tokens": 1369963145 }, { "epoch": 0.8674668667166792, "grad_norm": 0.8776029944419861, "learning_rate": 2e-05, "loss": 0.5747, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13876, "tokens_per_second_per_gpu": 10186.69, "total_tokens": 1370060430 }, { "epoch": 0.8675293823455864, "grad_norm": 0.8636817932128906, "learning_rate": 2e-05, "loss": 0.6106, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13877, "tokens_per_second_per_gpu": 10298.49, "total_tokens": 1370157963 }, { "epoch": 0.8675918979744937, "grad_norm": 0.8616158366203308, "learning_rate": 2e-05, "loss": 0.6521, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13878, "tokens_per_second_per_gpu": 11105.42, "total_tokens": 1370261831 }, { "epoch": 0.8676544136034009, "grad_norm": 0.8638023138046265, "learning_rate": 2e-05, "loss": 0.5928, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13879, "tokens_per_second_per_gpu": 10115.76, "total_tokens": 1370359449 }, { "epoch": 0.867716929232308, "grad_norm": 0.8678733110427856, "learning_rate": 2e-05, "loss": 0.6102, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13880, "tokens_per_second_per_gpu": 10039.39, "total_tokens": 1370458590 }, { "epoch": 0.8677794448612153, "grad_norm": 0.8975763320922852, "learning_rate": 2e-05, "loss": 0.5923, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13881, "tokens_per_second_per_gpu": 9946.58, "total_tokens": 1370557006 }, { "epoch": 0.8678419604901225, "grad_norm": 0.8697637915611267, "learning_rate": 2e-05, "loss": 0.6033, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13882, "tokens_per_second_per_gpu": 10103.08, "total_tokens": 1370655580 }, { "epoch": 0.8679044761190298, "grad_norm": 0.8653253316879272, "learning_rate": 2e-05, "loss": 0.6208, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13883, "tokens_per_second_per_gpu": 10483.13, "total_tokens": 1370754380 }, { "epoch": 0.867966991747937, "grad_norm": 0.9013829827308655, "learning_rate": 2e-05, "loss": 0.6079, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13884, "tokens_per_second_per_gpu": 10915.94, "total_tokens": 1370852776 }, { "epoch": 0.8680295073768443, "grad_norm": 0.8946645259857178, "learning_rate": 2e-05, "loss": 0.6143, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13885, "tokens_per_second_per_gpu": 9861.72, "total_tokens": 1370950459 }, { "epoch": 0.8680920230057514, "grad_norm": 0.8740879893302917, "learning_rate": 2e-05, "loss": 0.6079, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13886, "tokens_per_second_per_gpu": 10563.04, "total_tokens": 1371045502 }, { "epoch": 0.8681545386346586, "grad_norm": 0.897247850894928, "learning_rate": 2e-05, "loss": 0.6084, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13887, "tokens_per_second_per_gpu": 10059.15, "total_tokens": 1371144160 }, { "epoch": 0.8682170542635659, "grad_norm": 0.8815425038337708, "learning_rate": 2e-05, "loss": 0.6022, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13888, "tokens_per_second_per_gpu": 10390.89, "total_tokens": 1371240180 }, { "epoch": 0.8682795698924731, "grad_norm": 0.8916841745376587, "learning_rate": 2e-05, "loss": 0.5859, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13889, "tokens_per_second_per_gpu": 10729.61, "total_tokens": 1371337987 }, { "epoch": 0.8683420855213804, "grad_norm": 0.8774313926696777, "learning_rate": 2e-05, "loss": 0.6122, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13890, "tokens_per_second_per_gpu": 11522.65, "total_tokens": 1371441615 }, { "epoch": 0.8684046011502876, "grad_norm": 0.9071101546287537, "learning_rate": 2e-05, "loss": 0.5982, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13891, "tokens_per_second_per_gpu": 9697.41, "total_tokens": 1371536966 }, { "epoch": 0.8684671167791947, "grad_norm": 0.8842926621437073, "learning_rate": 2e-05, "loss": 0.6197, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13892, "tokens_per_second_per_gpu": 10647.35, "total_tokens": 1371637032 }, { "epoch": 0.868529632408102, "grad_norm": 0.88885897397995, "learning_rate": 2e-05, "loss": 0.662, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13893, "tokens_per_second_per_gpu": 10607.01, "total_tokens": 1371739949 }, { "epoch": 0.8685921480370092, "grad_norm": 0.9314689040184021, "learning_rate": 2e-05, "loss": 0.641, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13894, "tokens_per_second_per_gpu": 10025.01, "total_tokens": 1371836052 }, { "epoch": 0.8686546636659165, "grad_norm": 0.8831226229667664, "learning_rate": 2e-05, "loss": 0.6009, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13895, "tokens_per_second_per_gpu": 10879.04, "total_tokens": 1371935329 }, { "epoch": 0.8687171792948237, "grad_norm": 0.887249767780304, "learning_rate": 2e-05, "loss": 0.5933, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13896, "tokens_per_second_per_gpu": 10693.74, "total_tokens": 1372032862 }, { "epoch": 0.868779694923731, "grad_norm": 0.8666689395904541, "learning_rate": 2e-05, "loss": 0.5747, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13897, "tokens_per_second_per_gpu": 10021.41, "total_tokens": 1372130254 }, { "epoch": 0.8688422105526381, "grad_norm": 0.88551926612854, "learning_rate": 2e-05, "loss": 0.6163, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13898, "tokens_per_second_per_gpu": 10212.15, "total_tokens": 1372229208 }, { "epoch": 0.8689047261815454, "grad_norm": 0.8645411729812622, "learning_rate": 2e-05, "loss": 0.6246, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13899, "tokens_per_second_per_gpu": 10455.2, "total_tokens": 1372329364 }, { "epoch": 0.8689672418104526, "grad_norm": 0.8667671084403992, "learning_rate": 2e-05, "loss": 0.6383, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13900, "tokens_per_second_per_gpu": 11505.94, "total_tokens": 1372433846 }, { "epoch": 0.8690297574393598, "grad_norm": 0.8837283849716187, "learning_rate": 2e-05, "loss": 0.6437, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13901, "tokens_per_second_per_gpu": 9942.25, "total_tokens": 1372533024 }, { "epoch": 0.8690922730682671, "grad_norm": 0.8913776874542236, "learning_rate": 2e-05, "loss": 0.653, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13902, "tokens_per_second_per_gpu": 11513.54, "total_tokens": 1372634088 }, { "epoch": 0.8691547886971743, "grad_norm": 0.852181077003479, "learning_rate": 2e-05, "loss": 0.6133, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13903, "tokens_per_second_per_gpu": 10850.3, "total_tokens": 1372736081 }, { "epoch": 0.8692173043260816, "grad_norm": 0.8937325477600098, "learning_rate": 2e-05, "loss": 0.6142, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13904, "tokens_per_second_per_gpu": 10449.8, "total_tokens": 1372834321 }, { "epoch": 0.8692798199549887, "grad_norm": 0.8830609917640686, "learning_rate": 2e-05, "loss": 0.5581, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13905, "tokens_per_second_per_gpu": 9552.27, "total_tokens": 1372929171 }, { "epoch": 0.869342335583896, "grad_norm": 0.8855904936790466, "learning_rate": 2e-05, "loss": 0.6466, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13906, "tokens_per_second_per_gpu": 9963.41, "total_tokens": 1373028548 }, { "epoch": 0.8694048512128032, "grad_norm": 0.8767973780632019, "learning_rate": 2e-05, "loss": 0.6614, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13907, "tokens_per_second_per_gpu": 9934.52, "total_tokens": 1373128663 }, { "epoch": 0.8694673668417104, "grad_norm": 0.898081362247467, "learning_rate": 2e-05, "loss": 0.5987, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13908, "tokens_per_second_per_gpu": 10525.77, "total_tokens": 1373225440 }, { "epoch": 0.8695298824706177, "grad_norm": 0.8755658268928528, "learning_rate": 2e-05, "loss": 0.5854, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13909, "tokens_per_second_per_gpu": 9709.94, "total_tokens": 1373319579 }, { "epoch": 0.8695923980995249, "grad_norm": 0.8901740312576294, "learning_rate": 2e-05, "loss": 0.5874, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13910, "tokens_per_second_per_gpu": 9901.24, "total_tokens": 1373417381 }, { "epoch": 0.8696549137284321, "grad_norm": 0.8578791618347168, "learning_rate": 2e-05, "loss": 0.5768, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13911, "tokens_per_second_per_gpu": 11259.43, "total_tokens": 1373514704 }, { "epoch": 0.8697174293573393, "grad_norm": 0.8743325471878052, "learning_rate": 2e-05, "loss": 0.6189, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13912, "tokens_per_second_per_gpu": 10619.97, "total_tokens": 1373613571 }, { "epoch": 0.8697799449862466, "grad_norm": 0.8682973384857178, "learning_rate": 2e-05, "loss": 0.5887, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13913, "tokens_per_second_per_gpu": 11078.48, "total_tokens": 1373710880 }, { "epoch": 0.8698424606151538, "grad_norm": 0.9487606883049011, "learning_rate": 2e-05, "loss": 0.6531, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13914, "tokens_per_second_per_gpu": 10353.23, "total_tokens": 1373805900 }, { "epoch": 0.869904976244061, "grad_norm": 0.9012715220451355, "learning_rate": 2e-05, "loss": 0.6014, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13915, "tokens_per_second_per_gpu": 11324.58, "total_tokens": 1373903556 }, { "epoch": 0.8699674918729683, "grad_norm": 0.9395919442176819, "learning_rate": 2e-05, "loss": 0.5982, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13916, "tokens_per_second_per_gpu": 9838.29, "total_tokens": 1374001353 }, { "epoch": 0.8700300075018754, "grad_norm": 0.886969268321991, "learning_rate": 2e-05, "loss": 0.6011, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13917, "tokens_per_second_per_gpu": 9543.71, "total_tokens": 1374099007 }, { "epoch": 0.8700925231307827, "grad_norm": 0.8741400837898254, "learning_rate": 2e-05, "loss": 0.5911, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13918, "tokens_per_second_per_gpu": 10457.32, "total_tokens": 1374198832 }, { "epoch": 0.8701550387596899, "grad_norm": 0.8430436253547668, "learning_rate": 2e-05, "loss": 0.6148, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13919, "tokens_per_second_per_gpu": 11001.29, "total_tokens": 1374300152 }, { "epoch": 0.8702175543885972, "grad_norm": 0.9244102239608765, "learning_rate": 2e-05, "loss": 0.6332, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13920, "tokens_per_second_per_gpu": 10716.35, "total_tokens": 1374396918 }, { "epoch": 0.8702800700175044, "grad_norm": 0.8924983143806458, "learning_rate": 2e-05, "loss": 0.6174, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13921, "tokens_per_second_per_gpu": 10395.42, "total_tokens": 1374493877 }, { "epoch": 0.8703425856464116, "grad_norm": 0.898546576499939, "learning_rate": 2e-05, "loss": 0.6251, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13922, "tokens_per_second_per_gpu": 10930.33, "total_tokens": 1374595068 }, { "epoch": 0.8704051012753188, "grad_norm": 0.9406299591064453, "learning_rate": 2e-05, "loss": 0.5862, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13923, "tokens_per_second_per_gpu": 9587.69, "total_tokens": 1374686302 }, { "epoch": 0.870467616904226, "grad_norm": 0.947129487991333, "learning_rate": 2e-05, "loss": 0.6024, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13924, "tokens_per_second_per_gpu": 9832.99, "total_tokens": 1374785512 }, { "epoch": 0.8705301325331333, "grad_norm": 0.8844559192657471, "learning_rate": 2e-05, "loss": 0.5973, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13925, "tokens_per_second_per_gpu": 10032.48, "total_tokens": 1374883338 }, { "epoch": 0.8705926481620405, "grad_norm": 0.893928587436676, "learning_rate": 2e-05, "loss": 0.6676, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13926, "tokens_per_second_per_gpu": 10852.29, "total_tokens": 1374983906 }, { "epoch": 0.8706551637909478, "grad_norm": 0.8719022274017334, "learning_rate": 2e-05, "loss": 0.5921, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13927, "tokens_per_second_per_gpu": 10884.88, "total_tokens": 1375085539 }, { "epoch": 0.870717679419855, "grad_norm": 0.9783727526664734, "learning_rate": 2e-05, "loss": 0.6472, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13928, "tokens_per_second_per_gpu": 11141.86, "total_tokens": 1375189071 }, { "epoch": 0.8707801950487621, "grad_norm": 0.8710824847221375, "learning_rate": 2e-05, "loss": 0.5711, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13929, "tokens_per_second_per_gpu": 10884.58, "total_tokens": 1375289015 }, { "epoch": 0.8708427106776694, "grad_norm": 0.8850244283676147, "learning_rate": 2e-05, "loss": 0.6115, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13930, "tokens_per_second_per_gpu": 10146.96, "total_tokens": 1375386454 }, { "epoch": 0.8709052263065766, "grad_norm": 0.8863054513931274, "learning_rate": 2e-05, "loss": 0.6298, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13931, "tokens_per_second_per_gpu": 10983.32, "total_tokens": 1375490601 }, { "epoch": 0.8709677419354839, "grad_norm": 0.8607070446014404, "learning_rate": 2e-05, "loss": 0.6129, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13932, "tokens_per_second_per_gpu": 10471.71, "total_tokens": 1375590252 }, { "epoch": 0.8710302575643911, "grad_norm": 0.8857817053794861, "learning_rate": 2e-05, "loss": 0.6301, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13933, "tokens_per_second_per_gpu": 11622.54, "total_tokens": 1375690462 }, { "epoch": 0.8710927731932984, "grad_norm": 0.9048044681549072, "learning_rate": 2e-05, "loss": 0.6702, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13934, "tokens_per_second_per_gpu": 11218.53, "total_tokens": 1375795108 }, { "epoch": 0.8711552888222055, "grad_norm": 0.8992049098014832, "learning_rate": 2e-05, "loss": 0.6586, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13935, "tokens_per_second_per_gpu": 10282.44, "total_tokens": 1375896253 }, { "epoch": 0.8712178044511127, "grad_norm": 0.8783748149871826, "learning_rate": 2e-05, "loss": 0.6559, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13936, "tokens_per_second_per_gpu": 10888.83, "total_tokens": 1375995648 }, { "epoch": 0.87128032008002, "grad_norm": 0.9017643928527832, "learning_rate": 2e-05, "loss": 0.6136, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13937, "tokens_per_second_per_gpu": 10430.09, "total_tokens": 1376096264 }, { "epoch": 0.8713428357089272, "grad_norm": 0.865540623664856, "learning_rate": 2e-05, "loss": 0.61, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13938, "tokens_per_second_per_gpu": 10552.77, "total_tokens": 1376196527 }, { "epoch": 0.8714053513378345, "grad_norm": 0.8909680843353271, "learning_rate": 2e-05, "loss": 0.6277, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13939, "tokens_per_second_per_gpu": 10558.93, "total_tokens": 1376296680 }, { "epoch": 0.8714678669667417, "grad_norm": 0.8935245871543884, "learning_rate": 2e-05, "loss": 0.6384, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13940, "tokens_per_second_per_gpu": 11409.17, "total_tokens": 1376396990 }, { "epoch": 0.871530382595649, "grad_norm": 0.9136154055595398, "learning_rate": 2e-05, "loss": 0.624, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13941, "tokens_per_second_per_gpu": 11487.13, "total_tokens": 1376497073 }, { "epoch": 0.8715928982245561, "grad_norm": 0.90201336145401, "learning_rate": 2e-05, "loss": 0.6378, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13942, "tokens_per_second_per_gpu": 10582.12, "total_tokens": 1376595344 }, { "epoch": 0.8716554138534633, "grad_norm": 0.8961563110351562, "learning_rate": 2e-05, "loss": 0.5897, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13943, "tokens_per_second_per_gpu": 10187.76, "total_tokens": 1376691951 }, { "epoch": 0.8717179294823706, "grad_norm": 0.9399213194847107, "learning_rate": 2e-05, "loss": 0.6006, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13944, "tokens_per_second_per_gpu": 9888.62, "total_tokens": 1376786297 }, { "epoch": 0.8717804451112778, "grad_norm": 0.8648030757904053, "learning_rate": 2e-05, "loss": 0.5947, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13945, "tokens_per_second_per_gpu": 10352.4, "total_tokens": 1376884958 }, { "epoch": 0.8718429607401851, "grad_norm": 0.8988590240478516, "learning_rate": 2e-05, "loss": 0.5965, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13946, "tokens_per_second_per_gpu": 10409.4, "total_tokens": 1376985870 }, { "epoch": 0.8719054763690923, "grad_norm": 0.8918653130531311, "learning_rate": 2e-05, "loss": 0.6293, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13947, "tokens_per_second_per_gpu": 10566.5, "total_tokens": 1377084182 }, { "epoch": 0.8719679919979995, "grad_norm": 0.9168916344642639, "learning_rate": 2e-05, "loss": 0.5979, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13948, "tokens_per_second_per_gpu": 9308.86, "total_tokens": 1377178690 }, { "epoch": 0.8720305076269067, "grad_norm": 0.8785223364830017, "learning_rate": 2e-05, "loss": 0.5996, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13949, "tokens_per_second_per_gpu": 10693.85, "total_tokens": 1377280575 }, { "epoch": 0.872093023255814, "grad_norm": 0.8731186389923096, "learning_rate": 2e-05, "loss": 0.6235, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13950, "tokens_per_second_per_gpu": 10338.94, "total_tokens": 1377382272 }, { "epoch": 0.8721555388847212, "grad_norm": 0.8864127993583679, "learning_rate": 2e-05, "loss": 0.631, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13951, "tokens_per_second_per_gpu": 10797.32, "total_tokens": 1377481733 }, { "epoch": 0.8722180545136284, "grad_norm": 0.8690813779830933, "learning_rate": 2e-05, "loss": 0.6336, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13952, "tokens_per_second_per_gpu": 10814.47, "total_tokens": 1377582297 }, { "epoch": 0.8722805701425357, "grad_norm": 0.8885089159011841, "learning_rate": 2e-05, "loss": 0.5807, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13953, "tokens_per_second_per_gpu": 10489.89, "total_tokens": 1377679397 }, { "epoch": 0.8723430857714428, "grad_norm": 0.8734980821609497, "learning_rate": 2e-05, "loss": 0.5951, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13954, "tokens_per_second_per_gpu": 10336.97, "total_tokens": 1377777929 }, { "epoch": 0.8724056014003501, "grad_norm": 0.8907683491706848, "learning_rate": 2e-05, "loss": 0.6135, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13955, "tokens_per_second_per_gpu": 9917.86, "total_tokens": 1377877779 }, { "epoch": 0.8724681170292573, "grad_norm": 0.888896644115448, "learning_rate": 2e-05, "loss": 0.6029, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13956, "tokens_per_second_per_gpu": 10332.63, "total_tokens": 1377973405 }, { "epoch": 0.8725306326581646, "grad_norm": 0.8925977945327759, "learning_rate": 2e-05, "loss": 0.6108, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13957, "tokens_per_second_per_gpu": 11252.79, "total_tokens": 1378070430 }, { "epoch": 0.8725931482870718, "grad_norm": 0.9119559526443481, "learning_rate": 2e-05, "loss": 0.6324, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13958, "tokens_per_second_per_gpu": 10967.91, "total_tokens": 1378172166 }, { "epoch": 0.872655663915979, "grad_norm": 0.8837625980377197, "learning_rate": 2e-05, "loss": 0.6073, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13959, "tokens_per_second_per_gpu": 10240.38, "total_tokens": 1378271414 }, { "epoch": 0.8727181795448862, "grad_norm": 0.9577794671058655, "learning_rate": 2e-05, "loss": 0.6011, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13960, "tokens_per_second_per_gpu": 10809.4, "total_tokens": 1378370082 }, { "epoch": 0.8727806951737934, "grad_norm": 0.8439260125160217, "learning_rate": 2e-05, "loss": 0.5508, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13961, "tokens_per_second_per_gpu": 10361.92, "total_tokens": 1378468284 }, { "epoch": 0.8728432108027007, "grad_norm": 0.8927034735679626, "learning_rate": 2e-05, "loss": 0.6474, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13962, "tokens_per_second_per_gpu": 10998.02, "total_tokens": 1378571912 }, { "epoch": 0.8729057264316079, "grad_norm": 0.8686504364013672, "learning_rate": 2e-05, "loss": 0.6284, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13963, "tokens_per_second_per_gpu": 10372.23, "total_tokens": 1378670961 }, { "epoch": 0.8729682420605152, "grad_norm": 0.8854312896728516, "learning_rate": 2e-05, "loss": 0.5764, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13964, "tokens_per_second_per_gpu": 9920.59, "total_tokens": 1378764563 }, { "epoch": 0.8730307576894224, "grad_norm": 0.9295897483825684, "learning_rate": 2e-05, "loss": 0.6615, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13965, "tokens_per_second_per_gpu": 10765.55, "total_tokens": 1378862966 }, { "epoch": 0.8730932733183295, "grad_norm": 0.9551082849502563, "learning_rate": 2e-05, "loss": 0.6298, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13966, "tokens_per_second_per_gpu": 10158.5, "total_tokens": 1378960067 }, { "epoch": 0.8731557889472368, "grad_norm": 0.8807311654090881, "learning_rate": 2e-05, "loss": 0.6153, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13967, "tokens_per_second_per_gpu": 9919.24, "total_tokens": 1379057786 }, { "epoch": 0.873218304576144, "grad_norm": 0.8929738998413086, "learning_rate": 2e-05, "loss": 0.5963, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13968, "tokens_per_second_per_gpu": 10469.65, "total_tokens": 1379156348 }, { "epoch": 0.8732808202050513, "grad_norm": 0.869583249092102, "learning_rate": 2e-05, "loss": 0.6598, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13969, "tokens_per_second_per_gpu": 10256.58, "total_tokens": 1379259703 }, { "epoch": 0.8733433358339585, "grad_norm": 0.9099989533424377, "learning_rate": 2e-05, "loss": 0.6317, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13970, "tokens_per_second_per_gpu": 10361.71, "total_tokens": 1379360246 }, { "epoch": 0.8734058514628658, "grad_norm": 0.8903287649154663, "learning_rate": 2e-05, "loss": 0.6232, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13971, "tokens_per_second_per_gpu": 10734.04, "total_tokens": 1379460572 }, { "epoch": 0.8734683670917729, "grad_norm": 0.9205528497695923, "learning_rate": 2e-05, "loss": 0.6278, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13972, "tokens_per_second_per_gpu": 11485.8, "total_tokens": 1379563628 }, { "epoch": 0.8735308827206801, "grad_norm": 0.8965094685554504, "learning_rate": 2e-05, "loss": 0.6114, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13973, "tokens_per_second_per_gpu": 10255.81, "total_tokens": 1379661050 }, { "epoch": 0.8735933983495874, "grad_norm": 0.8732335567474365, "learning_rate": 2e-05, "loss": 0.6231, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13974, "tokens_per_second_per_gpu": 10505.69, "total_tokens": 1379762595 }, { "epoch": 0.8736559139784946, "grad_norm": 0.9240405559539795, "learning_rate": 2e-05, "loss": 0.6594, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13975, "tokens_per_second_per_gpu": 9976.75, "total_tokens": 1379859054 }, { "epoch": 0.8737184296074019, "grad_norm": 0.8889265656471252, "learning_rate": 2e-05, "loss": 0.5939, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13976, "tokens_per_second_per_gpu": 10271.46, "total_tokens": 1379958147 }, { "epoch": 0.8737809452363091, "grad_norm": 0.882742702960968, "learning_rate": 2e-05, "loss": 0.6756, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13977, "tokens_per_second_per_gpu": 10636.52, "total_tokens": 1380060019 }, { "epoch": 0.8738434608652164, "grad_norm": 0.8873242139816284, "learning_rate": 2e-05, "loss": 0.5843, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13978, "tokens_per_second_per_gpu": 10644.5, "total_tokens": 1380161186 }, { "epoch": 0.8739059764941235, "grad_norm": 0.8978060483932495, "learning_rate": 2e-05, "loss": 0.6015, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13979, "tokens_per_second_per_gpu": 10426.25, "total_tokens": 1380257647 }, { "epoch": 0.8739684921230307, "grad_norm": 0.8994836807250977, "learning_rate": 2e-05, "loss": 0.6003, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13980, "tokens_per_second_per_gpu": 10825.03, "total_tokens": 1380356661 }, { "epoch": 0.874031007751938, "grad_norm": 0.9294446110725403, "learning_rate": 2e-05, "loss": 0.6071, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13981, "tokens_per_second_per_gpu": 10531.04, "total_tokens": 1380458484 }, { "epoch": 0.8740935233808452, "grad_norm": 0.8941532373428345, "learning_rate": 2e-05, "loss": 0.6077, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13982, "tokens_per_second_per_gpu": 10550.79, "total_tokens": 1380555932 }, { "epoch": 0.8741560390097525, "grad_norm": 0.8914920091629028, "learning_rate": 2e-05, "loss": 0.6172, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13983, "tokens_per_second_per_gpu": 10265.23, "total_tokens": 1380655616 }, { "epoch": 0.8742185546386597, "grad_norm": 0.8703743815422058, "learning_rate": 2e-05, "loss": 0.6137, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13984, "tokens_per_second_per_gpu": 10511.5, "total_tokens": 1380752652 }, { "epoch": 0.8742810702675669, "grad_norm": 0.866625189781189, "learning_rate": 2e-05, "loss": 0.6153, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13985, "tokens_per_second_per_gpu": 10792.44, "total_tokens": 1380855161 }, { "epoch": 0.8743435858964741, "grad_norm": 0.9191141128540039, "learning_rate": 2e-05, "loss": 0.5988, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13986, "tokens_per_second_per_gpu": 11159.62, "total_tokens": 1380952760 }, { "epoch": 0.8744061015253813, "grad_norm": 0.8949463367462158, "learning_rate": 2e-05, "loss": 0.5908, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13987, "tokens_per_second_per_gpu": 10537.38, "total_tokens": 1381051516 }, { "epoch": 0.8744686171542886, "grad_norm": 0.843717098236084, "learning_rate": 2e-05, "loss": 0.5822, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13988, "tokens_per_second_per_gpu": 10660.38, "total_tokens": 1381152367 }, { "epoch": 0.8745311327831958, "grad_norm": 0.8631966710090637, "learning_rate": 2e-05, "loss": 0.5557, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13989, "tokens_per_second_per_gpu": 11504.57, "total_tokens": 1381248783 }, { "epoch": 0.8745936484121031, "grad_norm": 0.867001473903656, "learning_rate": 2e-05, "loss": 0.6257, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13990, "tokens_per_second_per_gpu": 10523.12, "total_tokens": 1381348768 }, { "epoch": 0.8746561640410102, "grad_norm": 0.884543776512146, "learning_rate": 2e-05, "loss": 0.6141, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13991, "tokens_per_second_per_gpu": 10702.85, "total_tokens": 1381447984 }, { "epoch": 0.8747186796699175, "grad_norm": 0.8918281197547913, "learning_rate": 2e-05, "loss": 0.6165, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13992, "tokens_per_second_per_gpu": 9605.54, "total_tokens": 1381544667 }, { "epoch": 0.8747811952988247, "grad_norm": 0.882170557975769, "learning_rate": 2e-05, "loss": 0.5566, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13993, "tokens_per_second_per_gpu": 10115.67, "total_tokens": 1381638999 }, { "epoch": 0.8748437109277319, "grad_norm": 0.8960003852844238, "learning_rate": 2e-05, "loss": 0.6435, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13994, "tokens_per_second_per_gpu": 10719.57, "total_tokens": 1381741382 }, { "epoch": 0.8749062265566392, "grad_norm": 0.9036737680435181, "learning_rate": 2e-05, "loss": 0.6292, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13995, "tokens_per_second_per_gpu": 10767.07, "total_tokens": 1381846335 }, { "epoch": 0.8749687421855464, "grad_norm": 0.8737433552742004, "learning_rate": 2e-05, "loss": 0.5734, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13996, "tokens_per_second_per_gpu": 10413.56, "total_tokens": 1381946036 }, { "epoch": 0.8750312578144536, "grad_norm": 0.9134939312934875, "learning_rate": 2e-05, "loss": 0.6147, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13997, "tokens_per_second_per_gpu": 10517.33, "total_tokens": 1382045295 }, { "epoch": 0.8750937734433608, "grad_norm": 0.9057551622390747, "learning_rate": 2e-05, "loss": 0.6106, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13998, "tokens_per_second_per_gpu": 10596.91, "total_tokens": 1382143519 }, { "epoch": 0.8751562890722681, "grad_norm": 0.8790653347969055, "learning_rate": 2e-05, "loss": 0.607, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 13999, "tokens_per_second_per_gpu": 10242.98, "total_tokens": 1382244600 }, { "epoch": 0.8752188047011753, "grad_norm": 0.8853226900100708, "learning_rate": 2e-05, "loss": 0.608, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14000, "tokens_per_second_per_gpu": 10198.3, "total_tokens": 1382345064 }, { "epoch": 0.8752813203300825, "grad_norm": 0.9002161622047424, "learning_rate": 2e-05, "loss": 0.641, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14001, "tokens_per_second_per_gpu": 10680.98, "total_tokens": 1382443863 }, { "epoch": 0.8753438359589898, "grad_norm": 0.8899539709091187, "learning_rate": 2e-05, "loss": 0.643, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14002, "tokens_per_second_per_gpu": 9905.12, "total_tokens": 1382541705 }, { "epoch": 0.8754063515878969, "grad_norm": 0.868446946144104, "learning_rate": 2e-05, "loss": 0.6188, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14003, "tokens_per_second_per_gpu": 10100.71, "total_tokens": 1382641935 }, { "epoch": 0.8754688672168042, "grad_norm": 0.8922114372253418, "learning_rate": 2e-05, "loss": 0.6228, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14004, "tokens_per_second_per_gpu": 10527.28, "total_tokens": 1382743023 }, { "epoch": 0.8755313828457114, "grad_norm": 0.9150538444519043, "learning_rate": 2e-05, "loss": 0.6276, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14005, "tokens_per_second_per_gpu": 10602.08, "total_tokens": 1382842312 }, { "epoch": 0.8755938984746187, "grad_norm": 0.8524549007415771, "learning_rate": 2e-05, "loss": 0.6136, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14006, "tokens_per_second_per_gpu": 10531.37, "total_tokens": 1382946754 }, { "epoch": 0.8756564141035259, "grad_norm": 0.8718723654747009, "learning_rate": 2e-05, "loss": 0.5936, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14007, "tokens_per_second_per_gpu": 11154.08, "total_tokens": 1383046187 }, { "epoch": 0.8757189297324331, "grad_norm": 0.9013551473617554, "learning_rate": 2e-05, "loss": 0.6322, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14008, "tokens_per_second_per_gpu": 10497.29, "total_tokens": 1383145984 }, { "epoch": 0.8757814453613403, "grad_norm": 0.8770872354507446, "learning_rate": 2e-05, "loss": 0.5638, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14009, "tokens_per_second_per_gpu": 9530.47, "total_tokens": 1383239042 }, { "epoch": 0.8758439609902475, "grad_norm": 0.9014446139335632, "learning_rate": 2e-05, "loss": 0.6213, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14010, "tokens_per_second_per_gpu": 10036.07, "total_tokens": 1383338028 }, { "epoch": 0.8759064766191548, "grad_norm": 0.8626185059547424, "learning_rate": 2e-05, "loss": 0.5968, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14011, "tokens_per_second_per_gpu": 11143.25, "total_tokens": 1383437451 }, { "epoch": 0.875968992248062, "grad_norm": 0.8995065689086914, "learning_rate": 2e-05, "loss": 0.6293, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14012, "tokens_per_second_per_gpu": 10184.56, "total_tokens": 1383535335 }, { "epoch": 0.8760315078769693, "grad_norm": 0.876906156539917, "learning_rate": 2e-05, "loss": 0.5946, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14013, "tokens_per_second_per_gpu": 10160.31, "total_tokens": 1383631340 }, { "epoch": 0.8760940235058765, "grad_norm": 0.8824569582939148, "learning_rate": 2e-05, "loss": 0.6138, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14014, "tokens_per_second_per_gpu": 10218.81, "total_tokens": 1383730122 }, { "epoch": 0.8761565391347836, "grad_norm": 0.9076724052429199, "learning_rate": 2e-05, "loss": 0.607, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14015, "tokens_per_second_per_gpu": 10176.98, "total_tokens": 1383827881 }, { "epoch": 0.8762190547636909, "grad_norm": 0.9072257280349731, "learning_rate": 2e-05, "loss": 0.6363, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14016, "tokens_per_second_per_gpu": 9854.7, "total_tokens": 1383925579 }, { "epoch": 0.8762815703925981, "grad_norm": 0.883781373500824, "learning_rate": 2e-05, "loss": 0.577, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14017, "tokens_per_second_per_gpu": 9729.07, "total_tokens": 1384020715 }, { "epoch": 0.8763440860215054, "grad_norm": 0.8735441565513611, "learning_rate": 2e-05, "loss": 0.6052, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14018, "tokens_per_second_per_gpu": 9694.07, "total_tokens": 1384121187 }, { "epoch": 0.8764066016504126, "grad_norm": 0.9094491004943848, "learning_rate": 2e-05, "loss": 0.621, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14019, "tokens_per_second_per_gpu": 10208.43, "total_tokens": 1384217638 }, { "epoch": 0.8764691172793199, "grad_norm": 0.8924276232719421, "learning_rate": 2e-05, "loss": 0.5696, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14020, "tokens_per_second_per_gpu": 9499.8, "total_tokens": 1384311308 }, { "epoch": 0.8765316329082271, "grad_norm": 0.8643739223480225, "learning_rate": 2e-05, "loss": 0.5804, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14021, "tokens_per_second_per_gpu": 11206.36, "total_tokens": 1384410999 }, { "epoch": 0.8765941485371342, "grad_norm": 0.8988345861434937, "learning_rate": 2e-05, "loss": 0.6875, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14022, "tokens_per_second_per_gpu": 10997.71, "total_tokens": 1384516955 }, { "epoch": 0.8766566641660415, "grad_norm": 0.9541931748390198, "learning_rate": 2e-05, "loss": 0.5871, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14023, "tokens_per_second_per_gpu": 10719.37, "total_tokens": 1384614152 }, { "epoch": 0.8767191797949487, "grad_norm": 0.8836846947669983, "learning_rate": 2e-05, "loss": 0.5753, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14024, "tokens_per_second_per_gpu": 9780.58, "total_tokens": 1384708658 }, { "epoch": 0.876781695423856, "grad_norm": 0.8898729681968689, "learning_rate": 2e-05, "loss": 0.5926, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14025, "tokens_per_second_per_gpu": 9915.54, "total_tokens": 1384809185 }, { "epoch": 0.8768442110527632, "grad_norm": 0.8459630012512207, "learning_rate": 2e-05, "loss": 0.5896, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14026, "tokens_per_second_per_gpu": 11006.19, "total_tokens": 1384912580 }, { "epoch": 0.8769067266816705, "grad_norm": 0.9090686440467834, "learning_rate": 2e-05, "loss": 0.6248, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14027, "tokens_per_second_per_gpu": 9969.3, "total_tokens": 1385010553 }, { "epoch": 0.8769692423105776, "grad_norm": 0.8976848125457764, "learning_rate": 2e-05, "loss": 0.6119, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14028, "tokens_per_second_per_gpu": 10155.38, "total_tokens": 1385104973 }, { "epoch": 0.8770317579394848, "grad_norm": 0.933570384979248, "learning_rate": 2e-05, "loss": 0.5757, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14029, "tokens_per_second_per_gpu": 10477.67, "total_tokens": 1385200746 }, { "epoch": 0.8770942735683921, "grad_norm": 0.8907123804092407, "learning_rate": 2e-05, "loss": 0.6264, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14030, "tokens_per_second_per_gpu": 10854.7, "total_tokens": 1385302110 }, { "epoch": 0.8771567891972993, "grad_norm": 0.9080380201339722, "learning_rate": 2e-05, "loss": 0.6174, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14031, "tokens_per_second_per_gpu": 10474.06, "total_tokens": 1385401214 }, { "epoch": 0.8772193048262066, "grad_norm": 0.8947010636329651, "learning_rate": 2e-05, "loss": 0.6397, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14032, "tokens_per_second_per_gpu": 10938.36, "total_tokens": 1385499870 }, { "epoch": 0.8772818204551138, "grad_norm": 0.8662412762641907, "learning_rate": 2e-05, "loss": 0.6261, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14033, "tokens_per_second_per_gpu": 10553.81, "total_tokens": 1385599154 }, { "epoch": 0.877344336084021, "grad_norm": 0.8710907101631165, "learning_rate": 2e-05, "loss": 0.6171, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14034, "tokens_per_second_per_gpu": 11489.96, "total_tokens": 1385700959 }, { "epoch": 0.8774068517129282, "grad_norm": 0.9373798370361328, "learning_rate": 2e-05, "loss": 0.6125, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14035, "tokens_per_second_per_gpu": 10580.75, "total_tokens": 1385797035 }, { "epoch": 0.8774693673418354, "grad_norm": 0.8974055647850037, "learning_rate": 2e-05, "loss": 0.6467, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14036, "tokens_per_second_per_gpu": 13973.02, "total_tokens": 1385898189 }, { "epoch": 0.8775318829707427, "grad_norm": 0.8899745345115662, "learning_rate": 2e-05, "loss": 0.6446, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14037, "tokens_per_second_per_gpu": 11138.34, "total_tokens": 1386001822 }, { "epoch": 0.8775943985996499, "grad_norm": 0.8849181532859802, "learning_rate": 2e-05, "loss": 0.5767, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14038, "tokens_per_second_per_gpu": 10170.89, "total_tokens": 1386099090 }, { "epoch": 0.8776569142285572, "grad_norm": 0.8817973136901855, "learning_rate": 2e-05, "loss": 0.6001, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14039, "tokens_per_second_per_gpu": 10358.72, "total_tokens": 1386199922 }, { "epoch": 0.8777194298574643, "grad_norm": 0.8714104890823364, "learning_rate": 2e-05, "loss": 0.5952, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14040, "tokens_per_second_per_gpu": 10539.84, "total_tokens": 1386297477 }, { "epoch": 0.8777819454863716, "grad_norm": 0.914964497089386, "learning_rate": 2e-05, "loss": 0.599, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14041, "tokens_per_second_per_gpu": 10892.96, "total_tokens": 1386397874 }, { "epoch": 0.8778444611152788, "grad_norm": 0.8803375959396362, "learning_rate": 2e-05, "loss": 0.6256, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14042, "tokens_per_second_per_gpu": 10210.98, "total_tokens": 1386498533 }, { "epoch": 0.877906976744186, "grad_norm": 0.8784492611885071, "learning_rate": 2e-05, "loss": 0.612, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14043, "tokens_per_second_per_gpu": 10517.79, "total_tokens": 1386600697 }, { "epoch": 0.8779694923730933, "grad_norm": 0.902099072933197, "learning_rate": 2e-05, "loss": 0.6295, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14044, "tokens_per_second_per_gpu": 10304.99, "total_tokens": 1386698155 }, { "epoch": 0.8780320080020005, "grad_norm": 0.8508328795433044, "learning_rate": 2e-05, "loss": 0.5909, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14045, "tokens_per_second_per_gpu": 10445.67, "total_tokens": 1386798300 }, { "epoch": 0.8780945236309077, "grad_norm": 0.9127016067504883, "learning_rate": 2e-05, "loss": 0.6251, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14046, "tokens_per_second_per_gpu": 10021.66, "total_tokens": 1386897638 }, { "epoch": 0.8781570392598149, "grad_norm": 0.8695751428604126, "learning_rate": 2e-05, "loss": 0.6151, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14047, "tokens_per_second_per_gpu": 10758.38, "total_tokens": 1386998532 }, { "epoch": 0.8782195548887222, "grad_norm": 0.9140003323554993, "learning_rate": 2e-05, "loss": 0.6382, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14048, "tokens_per_second_per_gpu": 9646.24, "total_tokens": 1387094964 }, { "epoch": 0.8782820705176294, "grad_norm": 0.9146955013275146, "learning_rate": 2e-05, "loss": 0.5791, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14049, "tokens_per_second_per_gpu": 10155.04, "total_tokens": 1387190815 }, { "epoch": 0.8783445861465367, "grad_norm": 0.942809522151947, "learning_rate": 2e-05, "loss": 0.626, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14050, "tokens_per_second_per_gpu": 9372.75, "total_tokens": 1387284063 }, { "epoch": 0.8784071017754439, "grad_norm": 0.8712720274925232, "learning_rate": 2e-05, "loss": 0.5877, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14051, "tokens_per_second_per_gpu": 9998.54, "total_tokens": 1387380900 }, { "epoch": 0.878469617404351, "grad_norm": 0.9152787923812866, "learning_rate": 2e-05, "loss": 0.6495, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14052, "tokens_per_second_per_gpu": 11091.95, "total_tokens": 1387483426 }, { "epoch": 0.8785321330332583, "grad_norm": 0.9098109006881714, "learning_rate": 2e-05, "loss": 0.6283, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14053, "tokens_per_second_per_gpu": 11023.53, "total_tokens": 1387583912 }, { "epoch": 0.8785946486621655, "grad_norm": 0.8397692441940308, "learning_rate": 2e-05, "loss": 0.62, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14054, "tokens_per_second_per_gpu": 11278.57, "total_tokens": 1387690115 }, { "epoch": 0.8786571642910728, "grad_norm": 0.9062833786010742, "learning_rate": 2e-05, "loss": 0.6158, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14055, "tokens_per_second_per_gpu": 11476.54, "total_tokens": 1387795213 }, { "epoch": 0.87871967991998, "grad_norm": 0.9224569797515869, "learning_rate": 2e-05, "loss": 0.6138, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14056, "tokens_per_second_per_gpu": 10189.05, "total_tokens": 1387891098 }, { "epoch": 0.8787821955488873, "grad_norm": 0.9129751324653625, "learning_rate": 2e-05, "loss": 0.6073, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14057, "tokens_per_second_per_gpu": 10470.78, "total_tokens": 1387985305 }, { "epoch": 0.8788447111777945, "grad_norm": 0.9390965700149536, "learning_rate": 2e-05, "loss": 0.6238, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14058, "tokens_per_second_per_gpu": 10674.19, "total_tokens": 1388083153 }, { "epoch": 0.8789072268067016, "grad_norm": 0.86991947889328, "learning_rate": 2e-05, "loss": 0.578, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14059, "tokens_per_second_per_gpu": 10645.59, "total_tokens": 1388181915 }, { "epoch": 0.8789697424356089, "grad_norm": 0.9075350761413574, "learning_rate": 2e-05, "loss": 0.6393, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14060, "tokens_per_second_per_gpu": 11291.12, "total_tokens": 1388281769 }, { "epoch": 0.8790322580645161, "grad_norm": 0.921055793762207, "learning_rate": 2e-05, "loss": 0.6033, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14061, "tokens_per_second_per_gpu": 9876.58, "total_tokens": 1388373042 }, { "epoch": 0.8790947736934234, "grad_norm": 0.869086503982544, "learning_rate": 2e-05, "loss": 0.5818, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14062, "tokens_per_second_per_gpu": 10998.58, "total_tokens": 1388471902 }, { "epoch": 0.8791572893223306, "grad_norm": 0.9175443649291992, "learning_rate": 2e-05, "loss": 0.6183, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14063, "tokens_per_second_per_gpu": 11112.45, "total_tokens": 1388569417 }, { "epoch": 0.8792198049512379, "grad_norm": 0.8501823544502258, "learning_rate": 2e-05, "loss": 0.5833, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14064, "tokens_per_second_per_gpu": 10578.28, "total_tokens": 1388669190 }, { "epoch": 0.879282320580145, "grad_norm": 0.8860330581665039, "learning_rate": 2e-05, "loss": 0.6321, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14065, "tokens_per_second_per_gpu": 10936.27, "total_tokens": 1388770750 }, { "epoch": 0.8793448362090522, "grad_norm": 0.8817200064659119, "learning_rate": 2e-05, "loss": 0.5821, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14066, "tokens_per_second_per_gpu": 11183.58, "total_tokens": 1388874389 }, { "epoch": 0.8794073518379595, "grad_norm": 0.8784477114677429, "learning_rate": 2e-05, "loss": 0.6118, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14067, "tokens_per_second_per_gpu": 10574.9, "total_tokens": 1388974491 }, { "epoch": 0.8794698674668667, "grad_norm": 0.8728228211402893, "learning_rate": 2e-05, "loss": 0.6134, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14068, "tokens_per_second_per_gpu": 10369.12, "total_tokens": 1389072137 }, { "epoch": 0.879532383095774, "grad_norm": 0.8561776280403137, "learning_rate": 2e-05, "loss": 0.6815, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14069, "tokens_per_second_per_gpu": 11232.87, "total_tokens": 1389177279 }, { "epoch": 0.8795948987246812, "grad_norm": 0.8822944164276123, "learning_rate": 2e-05, "loss": 0.6259, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14070, "tokens_per_second_per_gpu": 10094.6, "total_tokens": 1389274217 }, { "epoch": 0.8796574143535884, "grad_norm": 0.8663650155067444, "learning_rate": 2e-05, "loss": 0.5973, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14071, "tokens_per_second_per_gpu": 10846.59, "total_tokens": 1389370638 }, { "epoch": 0.8797199299824956, "grad_norm": 0.9227951169013977, "learning_rate": 2e-05, "loss": 0.6546, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14072, "tokens_per_second_per_gpu": 10340.31, "total_tokens": 1389468849 }, { "epoch": 0.8797824456114028, "grad_norm": 0.8760689496994019, "learning_rate": 2e-05, "loss": 0.6104, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14073, "tokens_per_second_per_gpu": 10851.08, "total_tokens": 1389567165 }, { "epoch": 0.8798449612403101, "grad_norm": 0.896280825138092, "learning_rate": 2e-05, "loss": 0.6171, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14074, "tokens_per_second_per_gpu": 9670.29, "total_tokens": 1389663142 }, { "epoch": 0.8799074768692173, "grad_norm": 0.8832778334617615, "learning_rate": 2e-05, "loss": 0.6421, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14075, "tokens_per_second_per_gpu": 10702.99, "total_tokens": 1389764334 }, { "epoch": 0.8799699924981246, "grad_norm": 0.8808905482292175, "learning_rate": 2e-05, "loss": 0.616, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14076, "tokens_per_second_per_gpu": 9970.37, "total_tokens": 1389862208 }, { "epoch": 0.8800325081270317, "grad_norm": 0.8990031480789185, "learning_rate": 2e-05, "loss": 0.586, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14077, "tokens_per_second_per_gpu": 9764.07, "total_tokens": 1389957066 }, { "epoch": 0.880095023755939, "grad_norm": 0.8977758288383484, "learning_rate": 2e-05, "loss": 0.6121, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14078, "tokens_per_second_per_gpu": 9817.39, "total_tokens": 1390052980 }, { "epoch": 0.8801575393848462, "grad_norm": 0.8963944911956787, "learning_rate": 2e-05, "loss": 0.6474, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14079, "tokens_per_second_per_gpu": 10659.31, "total_tokens": 1390156509 }, { "epoch": 0.8802200550137534, "grad_norm": 0.9170811772346497, "learning_rate": 2e-05, "loss": 0.6076, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14080, "tokens_per_second_per_gpu": 11111.79, "total_tokens": 1390256511 }, { "epoch": 0.8802825706426607, "grad_norm": 0.9011141061782837, "learning_rate": 2e-05, "loss": 0.618, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14081, "tokens_per_second_per_gpu": 11295.36, "total_tokens": 1390356977 }, { "epoch": 0.8803450862715679, "grad_norm": 0.8819741606712341, "learning_rate": 2e-05, "loss": 0.6158, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14082, "tokens_per_second_per_gpu": 10851.39, "total_tokens": 1390459226 }, { "epoch": 0.8804076019004751, "grad_norm": 0.8870566487312317, "learning_rate": 2e-05, "loss": 0.6399, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14083, "tokens_per_second_per_gpu": 11346.33, "total_tokens": 1390562706 }, { "epoch": 0.8804701175293823, "grad_norm": 0.9443657994270325, "learning_rate": 2e-05, "loss": 0.605, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14084, "tokens_per_second_per_gpu": 9924.44, "total_tokens": 1390660607 }, { "epoch": 0.8805326331582896, "grad_norm": 0.8924880623817444, "learning_rate": 2e-05, "loss": 0.5926, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14085, "tokens_per_second_per_gpu": 10837.19, "total_tokens": 1390759012 }, { "epoch": 0.8805951487871968, "grad_norm": 0.9456847310066223, "learning_rate": 2e-05, "loss": 0.5976, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14086, "tokens_per_second_per_gpu": 11507.88, "total_tokens": 1390854410 }, { "epoch": 0.880657664416104, "grad_norm": 0.9161606431007385, "learning_rate": 2e-05, "loss": 0.6382, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14087, "tokens_per_second_per_gpu": 10685.53, "total_tokens": 1390955650 }, { "epoch": 0.8807201800450113, "grad_norm": 0.8487164378166199, "learning_rate": 2e-05, "loss": 0.5694, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14088, "tokens_per_second_per_gpu": 10039.64, "total_tokens": 1391051580 }, { "epoch": 0.8807826956739184, "grad_norm": 0.8913889527320862, "learning_rate": 2e-05, "loss": 0.581, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14089, "tokens_per_second_per_gpu": 10975.26, "total_tokens": 1391150367 }, { "epoch": 0.8808452113028257, "grad_norm": 0.8975762128829956, "learning_rate": 2e-05, "loss": 0.5952, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14090, "tokens_per_second_per_gpu": 10056.09, "total_tokens": 1391246301 }, { "epoch": 0.8809077269317329, "grad_norm": 0.8902174234390259, "learning_rate": 2e-05, "loss": 0.6305, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14091, "tokens_per_second_per_gpu": 10641.79, "total_tokens": 1391345239 }, { "epoch": 0.8809702425606402, "grad_norm": 0.8924885988235474, "learning_rate": 2e-05, "loss": 0.602, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14092, "tokens_per_second_per_gpu": 10308.35, "total_tokens": 1391439386 }, { "epoch": 0.8810327581895474, "grad_norm": 0.8914601802825928, "learning_rate": 2e-05, "loss": 0.6074, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14093, "tokens_per_second_per_gpu": 10802.74, "total_tokens": 1391537469 }, { "epoch": 0.8810952738184546, "grad_norm": 0.8813304901123047, "learning_rate": 2e-05, "loss": 0.5897, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14094, "tokens_per_second_per_gpu": 10661.0, "total_tokens": 1391634835 }, { "epoch": 0.8811577894473619, "grad_norm": 0.8997824192047119, "learning_rate": 2e-05, "loss": 0.592, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14095, "tokens_per_second_per_gpu": 10087.06, "total_tokens": 1391728250 }, { "epoch": 0.881220305076269, "grad_norm": 0.9288287162780762, "learning_rate": 2e-05, "loss": 0.569, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14096, "tokens_per_second_per_gpu": 10321.86, "total_tokens": 1391825803 }, { "epoch": 0.8812828207051763, "grad_norm": 0.8952553272247314, "learning_rate": 2e-05, "loss": 0.575, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14097, "tokens_per_second_per_gpu": 9765.39, "total_tokens": 1391921407 }, { "epoch": 0.8813453363340835, "grad_norm": 0.9292277693748474, "learning_rate": 2e-05, "loss": 0.633, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14098, "tokens_per_second_per_gpu": 10042.39, "total_tokens": 1392019347 }, { "epoch": 0.8814078519629908, "grad_norm": 0.8865852952003479, "learning_rate": 2e-05, "loss": 0.5887, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14099, "tokens_per_second_per_gpu": 9965.69, "total_tokens": 1392115465 }, { "epoch": 0.881470367591898, "grad_norm": 0.9060023427009583, "learning_rate": 2e-05, "loss": 0.6192, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14100, "tokens_per_second_per_gpu": 9430.3, "total_tokens": 1392209766 }, { "epoch": 0.8815328832208053, "grad_norm": 0.9061400890350342, "learning_rate": 2e-05, "loss": 0.6408, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14101, "tokens_per_second_per_gpu": 11384.87, "total_tokens": 1392309398 }, { "epoch": 0.8815953988497124, "grad_norm": 0.8929615020751953, "learning_rate": 2e-05, "loss": 0.6077, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14102, "tokens_per_second_per_gpu": 10731.2, "total_tokens": 1392407737 }, { "epoch": 0.8816579144786196, "grad_norm": 0.8833010196685791, "learning_rate": 2e-05, "loss": 0.6422, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14103, "tokens_per_second_per_gpu": 10333.96, "total_tokens": 1392506823 }, { "epoch": 0.8817204301075269, "grad_norm": 0.8944228887557983, "learning_rate": 2e-05, "loss": 0.5479, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14104, "tokens_per_second_per_gpu": 9949.36, "total_tokens": 1392597570 }, { "epoch": 0.8817829457364341, "grad_norm": 0.8534738421440125, "learning_rate": 2e-05, "loss": 0.6077, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14105, "tokens_per_second_per_gpu": 10951.46, "total_tokens": 1392702616 }, { "epoch": 0.8818454613653414, "grad_norm": 0.859887421131134, "learning_rate": 2e-05, "loss": 0.6089, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14106, "tokens_per_second_per_gpu": 10468.35, "total_tokens": 1392803075 }, { "epoch": 0.8819079769942486, "grad_norm": 0.897443413734436, "learning_rate": 2e-05, "loss": 0.6238, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14107, "tokens_per_second_per_gpu": 10641.74, "total_tokens": 1392903630 }, { "epoch": 0.8819704926231557, "grad_norm": 0.8983637690544128, "learning_rate": 2e-05, "loss": 0.5964, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14108, "tokens_per_second_per_gpu": 9740.81, "total_tokens": 1392996446 }, { "epoch": 0.882033008252063, "grad_norm": 0.8994103670120239, "learning_rate": 2e-05, "loss": 0.6881, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14109, "tokens_per_second_per_gpu": 10811.14, "total_tokens": 1393096691 }, { "epoch": 0.8820955238809702, "grad_norm": 0.8583600521087646, "learning_rate": 2e-05, "loss": 0.613, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14110, "tokens_per_second_per_gpu": 10314.25, "total_tokens": 1393197985 }, { "epoch": 0.8821580395098775, "grad_norm": 0.9047970771789551, "learning_rate": 2e-05, "loss": 0.6048, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14111, "tokens_per_second_per_gpu": 10399.43, "total_tokens": 1393293436 }, { "epoch": 0.8822205551387847, "grad_norm": 0.9815313220024109, "learning_rate": 2e-05, "loss": 0.6183, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14112, "tokens_per_second_per_gpu": 11052.61, "total_tokens": 1393396232 }, { "epoch": 0.882283070767692, "grad_norm": 0.92882239818573, "learning_rate": 2e-05, "loss": 0.6252, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14113, "tokens_per_second_per_gpu": 10689.62, "total_tokens": 1393497237 }, { "epoch": 0.8823455863965991, "grad_norm": 0.889521062374115, "learning_rate": 2e-05, "loss": 0.5932, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14114, "tokens_per_second_per_gpu": 10150.17, "total_tokens": 1393595959 }, { "epoch": 0.8824081020255063, "grad_norm": 0.8913754224777222, "learning_rate": 2e-05, "loss": 0.594, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14115, "tokens_per_second_per_gpu": 10322.2, "total_tokens": 1393693650 }, { "epoch": 0.8824706176544136, "grad_norm": 0.8850934505462646, "learning_rate": 2e-05, "loss": 0.6108, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14116, "tokens_per_second_per_gpu": 10700.63, "total_tokens": 1393795375 }, { "epoch": 0.8825331332833208, "grad_norm": 0.9020592570304871, "learning_rate": 2e-05, "loss": 0.6368, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14117, "tokens_per_second_per_gpu": 11006.02, "total_tokens": 1393894580 }, { "epoch": 0.8825956489122281, "grad_norm": 0.9190046191215515, "learning_rate": 2e-05, "loss": 0.601, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14118, "tokens_per_second_per_gpu": 9842.73, "total_tokens": 1393989291 }, { "epoch": 0.8826581645411353, "grad_norm": 0.840023398399353, "learning_rate": 2e-05, "loss": 0.5698, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14119, "tokens_per_second_per_gpu": 10918.05, "total_tokens": 1394089358 }, { "epoch": 0.8827206801700425, "grad_norm": 0.8986076712608337, "learning_rate": 2e-05, "loss": 0.6233, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14120, "tokens_per_second_per_gpu": 10400.08, "total_tokens": 1394186518 }, { "epoch": 0.8827831957989497, "grad_norm": 0.9142897129058838, "learning_rate": 2e-05, "loss": 0.6148, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14121, "tokens_per_second_per_gpu": 10141.41, "total_tokens": 1394284838 }, { "epoch": 0.882845711427857, "grad_norm": 0.8750865459442139, "learning_rate": 2e-05, "loss": 0.6196, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14122, "tokens_per_second_per_gpu": 11349.63, "total_tokens": 1394387871 }, { "epoch": 0.8829082270567642, "grad_norm": 0.8527308702468872, "learning_rate": 2e-05, "loss": 0.6192, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14123, "tokens_per_second_per_gpu": 10454.53, "total_tokens": 1394487066 }, { "epoch": 0.8829707426856714, "grad_norm": 0.8782795071601868, "learning_rate": 2e-05, "loss": 0.6018, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14124, "tokens_per_second_per_gpu": 10598.15, "total_tokens": 1394586450 }, { "epoch": 0.8830332583145787, "grad_norm": 0.886412501335144, "learning_rate": 2e-05, "loss": 0.5976, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14125, "tokens_per_second_per_gpu": 10848.34, "total_tokens": 1394685224 }, { "epoch": 0.8830957739434858, "grad_norm": 0.9025453329086304, "learning_rate": 2e-05, "loss": 0.6201, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14126, "tokens_per_second_per_gpu": 10844.64, "total_tokens": 1394784047 }, { "epoch": 0.8831582895723931, "grad_norm": 0.9062568545341492, "learning_rate": 2e-05, "loss": 0.6047, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14127, "tokens_per_second_per_gpu": 9993.21, "total_tokens": 1394882644 }, { "epoch": 0.8832208052013003, "grad_norm": 0.8824983239173889, "learning_rate": 2e-05, "loss": 0.6185, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14128, "tokens_per_second_per_gpu": 10289.84, "total_tokens": 1394981803 }, { "epoch": 0.8832833208302076, "grad_norm": 0.9110879302024841, "learning_rate": 2e-05, "loss": 0.6045, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14129, "tokens_per_second_per_gpu": 10876.29, "total_tokens": 1395079359 }, { "epoch": 0.8833458364591148, "grad_norm": 0.8807969689369202, "learning_rate": 2e-05, "loss": 0.5663, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14130, "tokens_per_second_per_gpu": 10232.02, "total_tokens": 1395176396 }, { "epoch": 0.883408352088022, "grad_norm": 0.8758055567741394, "learning_rate": 2e-05, "loss": 0.601, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14131, "tokens_per_second_per_gpu": 10812.49, "total_tokens": 1395276035 }, { "epoch": 0.8834708677169293, "grad_norm": 0.8820188045501709, "learning_rate": 2e-05, "loss": 0.6488, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14132, "tokens_per_second_per_gpu": 10586.53, "total_tokens": 1395375533 }, { "epoch": 0.8835333833458364, "grad_norm": 0.9038748145103455, "learning_rate": 2e-05, "loss": 0.5609, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14133, "tokens_per_second_per_gpu": 10756.38, "total_tokens": 1395471504 }, { "epoch": 0.8835958989747437, "grad_norm": 0.9260058999061584, "learning_rate": 2e-05, "loss": 0.6044, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14134, "tokens_per_second_per_gpu": 8827.67, "total_tokens": 1395563977 }, { "epoch": 0.8836584146036509, "grad_norm": 0.8908121585845947, "learning_rate": 2e-05, "loss": 0.6386, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14135, "tokens_per_second_per_gpu": 10344.33, "total_tokens": 1395663820 }, { "epoch": 0.8837209302325582, "grad_norm": 0.869446337223053, "learning_rate": 2e-05, "loss": 0.627, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14136, "tokens_per_second_per_gpu": 11876.3, "total_tokens": 1395765297 }, { "epoch": 0.8837834458614654, "grad_norm": 0.8711059093475342, "learning_rate": 2e-05, "loss": 0.6129, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14137, "tokens_per_second_per_gpu": 10887.87, "total_tokens": 1395866909 }, { "epoch": 0.8838459614903726, "grad_norm": 0.8894590735435486, "learning_rate": 2e-05, "loss": 0.6165, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14138, "tokens_per_second_per_gpu": 9596.22, "total_tokens": 1395966227 }, { "epoch": 0.8839084771192798, "grad_norm": 0.8805057406425476, "learning_rate": 2e-05, "loss": 0.6336, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14139, "tokens_per_second_per_gpu": 10806.5, "total_tokens": 1396063779 }, { "epoch": 0.883970992748187, "grad_norm": 0.8572400212287903, "learning_rate": 2e-05, "loss": 0.6027, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14140, "tokens_per_second_per_gpu": 10809.04, "total_tokens": 1396163956 }, { "epoch": 0.8840335083770943, "grad_norm": 0.8629043102264404, "learning_rate": 2e-05, "loss": 0.6006, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14141, "tokens_per_second_per_gpu": 10914.2, "total_tokens": 1396263435 }, { "epoch": 0.8840960240060015, "grad_norm": 0.8675619959831238, "learning_rate": 2e-05, "loss": 0.6265, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14142, "tokens_per_second_per_gpu": 10754.39, "total_tokens": 1396363181 }, { "epoch": 0.8841585396349088, "grad_norm": 0.933616578578949, "learning_rate": 2e-05, "loss": 0.6337, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14143, "tokens_per_second_per_gpu": 10668.89, "total_tokens": 1396465135 }, { "epoch": 0.884221055263816, "grad_norm": 0.8952988982200623, "learning_rate": 2e-05, "loss": 0.6463, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14144, "tokens_per_second_per_gpu": 10579.91, "total_tokens": 1396563264 }, { "epoch": 0.8842835708927231, "grad_norm": 0.918321967124939, "learning_rate": 2e-05, "loss": 0.5916, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14145, "tokens_per_second_per_gpu": 10387.91, "total_tokens": 1396659604 }, { "epoch": 0.8843460865216304, "grad_norm": 0.8798362612724304, "learning_rate": 2e-05, "loss": 0.6098, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14146, "tokens_per_second_per_gpu": 10463.65, "total_tokens": 1396761970 }, { "epoch": 0.8844086021505376, "grad_norm": 0.8673505783081055, "learning_rate": 2e-05, "loss": 0.6204, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14147, "tokens_per_second_per_gpu": 10430.71, "total_tokens": 1396864666 }, { "epoch": 0.8844711177794449, "grad_norm": 0.851615846157074, "learning_rate": 2e-05, "loss": 0.6149, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14148, "tokens_per_second_per_gpu": 10953.14, "total_tokens": 1396966863 }, { "epoch": 0.8845336334083521, "grad_norm": 0.88299959897995, "learning_rate": 2e-05, "loss": 0.5971, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14149, "tokens_per_second_per_gpu": 10185.36, "total_tokens": 1397062194 }, { "epoch": 0.8845961490372594, "grad_norm": 0.8858991861343384, "learning_rate": 2e-05, "loss": 0.6157, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14150, "tokens_per_second_per_gpu": 11190.43, "total_tokens": 1397161159 }, { "epoch": 0.8846586646661665, "grad_norm": 0.98511803150177, "learning_rate": 2e-05, "loss": 0.6071, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14151, "tokens_per_second_per_gpu": 10231.45, "total_tokens": 1397255982 }, { "epoch": 0.8847211802950737, "grad_norm": 0.8866431713104248, "learning_rate": 2e-05, "loss": 0.5877, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14152, "tokens_per_second_per_gpu": 10909.73, "total_tokens": 1397357548 }, { "epoch": 0.884783695923981, "grad_norm": 0.8718945384025574, "learning_rate": 2e-05, "loss": 0.6212, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14153, "tokens_per_second_per_gpu": 11071.36, "total_tokens": 1397458216 }, { "epoch": 0.8848462115528882, "grad_norm": 0.9437318444252014, "learning_rate": 2e-05, "loss": 0.6236, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14154, "tokens_per_second_per_gpu": 10704.08, "total_tokens": 1397559019 }, { "epoch": 0.8849087271817955, "grad_norm": 0.9697861075401306, "learning_rate": 2e-05, "loss": 0.6023, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14155, "tokens_per_second_per_gpu": 11058.13, "total_tokens": 1397658086 }, { "epoch": 0.8849712428107027, "grad_norm": 0.8985291123390198, "learning_rate": 2e-05, "loss": 0.6421, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14156, "tokens_per_second_per_gpu": 9902.92, "total_tokens": 1397754552 }, { "epoch": 0.8850337584396099, "grad_norm": 0.9358238577842712, "learning_rate": 2e-05, "loss": 0.5573, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14157, "tokens_per_second_per_gpu": 9773.83, "total_tokens": 1397846927 }, { "epoch": 0.8850962740685171, "grad_norm": 0.8762392401695251, "learning_rate": 2e-05, "loss": 0.6, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14158, "tokens_per_second_per_gpu": 10594.24, "total_tokens": 1397944733 }, { "epoch": 0.8851587896974243, "grad_norm": 0.870883584022522, "learning_rate": 2e-05, "loss": 0.6037, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14159, "tokens_per_second_per_gpu": 10210.22, "total_tokens": 1398044747 }, { "epoch": 0.8852213053263316, "grad_norm": 0.8847946524620056, "learning_rate": 2e-05, "loss": 0.6112, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14160, "tokens_per_second_per_gpu": 10450.01, "total_tokens": 1398144323 }, { "epoch": 0.8852838209552388, "grad_norm": 0.9057481288909912, "learning_rate": 2e-05, "loss": 0.6675, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14161, "tokens_per_second_per_gpu": 10605.06, "total_tokens": 1398244939 }, { "epoch": 0.8853463365841461, "grad_norm": 0.8924825191497803, "learning_rate": 2e-05, "loss": 0.6417, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14162, "tokens_per_second_per_gpu": 10581.47, "total_tokens": 1398346017 }, { "epoch": 0.8854088522130532, "grad_norm": 0.8830528855323792, "learning_rate": 2e-05, "loss": 0.6401, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14163, "tokens_per_second_per_gpu": 10787.96, "total_tokens": 1398450986 }, { "epoch": 0.8854713678419605, "grad_norm": 0.8642361760139465, "learning_rate": 2e-05, "loss": 0.5932, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14164, "tokens_per_second_per_gpu": 9815.83, "total_tokens": 1398546249 }, { "epoch": 0.8855338834708677, "grad_norm": 0.8673847913742065, "learning_rate": 2e-05, "loss": 0.5847, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14165, "tokens_per_second_per_gpu": 10818.69, "total_tokens": 1398643340 }, { "epoch": 0.885596399099775, "grad_norm": 0.8733874559402466, "learning_rate": 2e-05, "loss": 0.6231, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14166, "tokens_per_second_per_gpu": 10356.83, "total_tokens": 1398741935 }, { "epoch": 0.8856589147286822, "grad_norm": 0.923372209072113, "learning_rate": 2e-05, "loss": 0.6232, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14167, "tokens_per_second_per_gpu": 9656.26, "total_tokens": 1398837147 }, { "epoch": 0.8857214303575894, "grad_norm": 0.9210196137428284, "learning_rate": 2e-05, "loss": 0.5955, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14168, "tokens_per_second_per_gpu": 10703.03, "total_tokens": 1398937113 }, { "epoch": 0.8857839459864967, "grad_norm": 0.9016172885894775, "learning_rate": 2e-05, "loss": 0.5857, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14169, "tokens_per_second_per_gpu": 10696.05, "total_tokens": 1399033344 }, { "epoch": 0.8858464616154038, "grad_norm": 0.9189367294311523, "learning_rate": 2e-05, "loss": 0.6271, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14170, "tokens_per_second_per_gpu": 10599.29, "total_tokens": 1399133315 }, { "epoch": 0.8859089772443111, "grad_norm": 0.8821700215339661, "learning_rate": 2e-05, "loss": 0.639, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14171, "tokens_per_second_per_gpu": 10724.69, "total_tokens": 1399236748 }, { "epoch": 0.8859714928732183, "grad_norm": 0.9731414914131165, "learning_rate": 2e-05, "loss": 0.6354, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14172, "tokens_per_second_per_gpu": 10657.13, "total_tokens": 1399335870 }, { "epoch": 0.8860340085021255, "grad_norm": 0.9375208020210266, "learning_rate": 2e-05, "loss": 0.6515, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14173, "tokens_per_second_per_gpu": 10308.72, "total_tokens": 1399436013 }, { "epoch": 0.8860965241310328, "grad_norm": 0.8702258467674255, "learning_rate": 2e-05, "loss": 0.6223, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14174, "tokens_per_second_per_gpu": 10761.33, "total_tokens": 1399535620 }, { "epoch": 0.88615903975994, "grad_norm": 0.9111195206642151, "learning_rate": 2e-05, "loss": 0.6058, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14175, "tokens_per_second_per_gpu": 10301.9, "total_tokens": 1399631196 }, { "epoch": 0.8862215553888472, "grad_norm": 0.8925947546958923, "learning_rate": 2e-05, "loss": 0.5987, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14176, "tokens_per_second_per_gpu": 10320.28, "total_tokens": 1399729110 }, { "epoch": 0.8862840710177544, "grad_norm": 0.8657130599021912, "learning_rate": 2e-05, "loss": 0.5955, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14177, "tokens_per_second_per_gpu": 10355.81, "total_tokens": 1399829010 }, { "epoch": 0.8863465866466617, "grad_norm": 0.8681868314743042, "learning_rate": 2e-05, "loss": 0.5612, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14178, "tokens_per_second_per_gpu": 10358.52, "total_tokens": 1399926220 }, { "epoch": 0.8864091022755689, "grad_norm": 0.8722896575927734, "learning_rate": 2e-05, "loss": 0.5625, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14179, "tokens_per_second_per_gpu": 10708.91, "total_tokens": 1400025428 }, { "epoch": 0.8864716179044762, "grad_norm": 0.9175595045089722, "learning_rate": 2e-05, "loss": 0.6452, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14180, "tokens_per_second_per_gpu": 10451.62, "total_tokens": 1400122784 }, { "epoch": 0.8865341335333834, "grad_norm": 0.9165650010108948, "learning_rate": 2e-05, "loss": 0.5696, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14181, "tokens_per_second_per_gpu": 9710.16, "total_tokens": 1400214299 }, { "epoch": 0.8865966491622905, "grad_norm": 0.8936074376106262, "learning_rate": 2e-05, "loss": 0.5982, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14182, "tokens_per_second_per_gpu": 9610.22, "total_tokens": 1400312213 }, { "epoch": 0.8866591647911978, "grad_norm": 0.8817901015281677, "learning_rate": 2e-05, "loss": 0.6367, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14183, "tokens_per_second_per_gpu": 10796.15, "total_tokens": 1400414041 }, { "epoch": 0.886721680420105, "grad_norm": 0.8721367716789246, "learning_rate": 2e-05, "loss": 0.5855, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14184, "tokens_per_second_per_gpu": 9850.83, "total_tokens": 1400511069 }, { "epoch": 0.8867841960490123, "grad_norm": 0.9064295291900635, "learning_rate": 2e-05, "loss": 0.5651, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14185, "tokens_per_second_per_gpu": 10627.63, "total_tokens": 1400605316 }, { "epoch": 0.8868467116779195, "grad_norm": 0.9049756526947021, "learning_rate": 2e-05, "loss": 0.6245, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14186, "tokens_per_second_per_gpu": 11203.14, "total_tokens": 1400709297 }, { "epoch": 0.8869092273068268, "grad_norm": 0.8884482383728027, "learning_rate": 2e-05, "loss": 0.6052, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14187, "tokens_per_second_per_gpu": 10510.33, "total_tokens": 1400804494 }, { "epoch": 0.8869717429357339, "grad_norm": 0.9048557281494141, "learning_rate": 2e-05, "loss": 0.6308, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14188, "tokens_per_second_per_gpu": 10251.19, "total_tokens": 1400904499 }, { "epoch": 0.8870342585646411, "grad_norm": 0.8623024225234985, "learning_rate": 2e-05, "loss": 0.6204, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14189, "tokens_per_second_per_gpu": 11389.39, "total_tokens": 1401007065 }, { "epoch": 0.8870967741935484, "grad_norm": 0.8794708847999573, "learning_rate": 2e-05, "loss": 0.6253, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14190, "tokens_per_second_per_gpu": 10991.31, "total_tokens": 1401109269 }, { "epoch": 0.8871592898224556, "grad_norm": 0.9224996566772461, "learning_rate": 2e-05, "loss": 0.6372, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14191, "tokens_per_second_per_gpu": 11121.12, "total_tokens": 1401206655 }, { "epoch": 0.8872218054513629, "grad_norm": 0.908849835395813, "learning_rate": 2e-05, "loss": 0.6652, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14192, "tokens_per_second_per_gpu": 10437.58, "total_tokens": 1401304313 }, { "epoch": 0.8872843210802701, "grad_norm": 0.8742552995681763, "learning_rate": 2e-05, "loss": 0.619, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14193, "tokens_per_second_per_gpu": 10600.51, "total_tokens": 1401401458 }, { "epoch": 0.8873468367091772, "grad_norm": 0.9077031016349792, "learning_rate": 2e-05, "loss": 0.6137, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14194, "tokens_per_second_per_gpu": 10445.72, "total_tokens": 1401502151 }, { "epoch": 0.8874093523380845, "grad_norm": 0.9262779355049133, "learning_rate": 2e-05, "loss": 0.6574, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14195, "tokens_per_second_per_gpu": 10867.16, "total_tokens": 1401605135 }, { "epoch": 0.8874718679669917, "grad_norm": 0.8769572973251343, "learning_rate": 2e-05, "loss": 0.6213, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14196, "tokens_per_second_per_gpu": 10892.02, "total_tokens": 1401705771 }, { "epoch": 0.887534383595899, "grad_norm": 0.9137774109840393, "learning_rate": 2e-05, "loss": 0.6162, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14197, "tokens_per_second_per_gpu": 10765.74, "total_tokens": 1401805892 }, { "epoch": 0.8875968992248062, "grad_norm": 0.9141840934753418, "learning_rate": 2e-05, "loss": 0.5974, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14198, "tokens_per_second_per_gpu": 10409.31, "total_tokens": 1401900241 }, { "epoch": 0.8876594148537135, "grad_norm": 0.8673321008682251, "learning_rate": 2e-05, "loss": 0.5895, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14199, "tokens_per_second_per_gpu": 10418.11, "total_tokens": 1402001088 }, { "epoch": 0.8877219304826206, "grad_norm": 0.8805694580078125, "learning_rate": 2e-05, "loss": 0.6221, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14200, "tokens_per_second_per_gpu": 10038.65, "total_tokens": 1402100137 }, { "epoch": 0.8877844461115278, "grad_norm": 0.8548018932342529, "learning_rate": 2e-05, "loss": 0.6031, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14201, "tokens_per_second_per_gpu": 10623.45, "total_tokens": 1402198193 }, { "epoch": 0.8878469617404351, "grad_norm": 0.8805901408195496, "learning_rate": 2e-05, "loss": 0.6277, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14202, "tokens_per_second_per_gpu": 10987.79, "total_tokens": 1402297009 }, { "epoch": 0.8879094773693423, "grad_norm": 0.9057409167289734, "learning_rate": 2e-05, "loss": 0.6088, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14203, "tokens_per_second_per_gpu": 10427.53, "total_tokens": 1402396425 }, { "epoch": 0.8879719929982496, "grad_norm": 0.8886865377426147, "learning_rate": 2e-05, "loss": 0.6102, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14204, "tokens_per_second_per_gpu": 10086.27, "total_tokens": 1402492628 }, { "epoch": 0.8880345086271568, "grad_norm": 0.8788610100746155, "learning_rate": 2e-05, "loss": 0.6385, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14205, "tokens_per_second_per_gpu": 10925.46, "total_tokens": 1402596229 }, { "epoch": 0.8880970242560641, "grad_norm": 0.9073407053947449, "learning_rate": 2e-05, "loss": 0.546, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14206, "tokens_per_second_per_gpu": 9715.79, "total_tokens": 1402688595 }, { "epoch": 0.8881595398849712, "grad_norm": 0.8915235996246338, "learning_rate": 2e-05, "loss": 0.6015, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14207, "tokens_per_second_per_gpu": 10125.31, "total_tokens": 1402786913 }, { "epoch": 0.8882220555138785, "grad_norm": 0.8491614460945129, "learning_rate": 2e-05, "loss": 0.6044, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14208, "tokens_per_second_per_gpu": 10748.06, "total_tokens": 1402889109 }, { "epoch": 0.8882845711427857, "grad_norm": 0.8707726001739502, "learning_rate": 2e-05, "loss": 0.5904, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14209, "tokens_per_second_per_gpu": 10590.12, "total_tokens": 1402987834 }, { "epoch": 0.8883470867716929, "grad_norm": 0.883323609828949, "learning_rate": 2e-05, "loss": 0.6202, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14210, "tokens_per_second_per_gpu": 10886.72, "total_tokens": 1403084470 }, { "epoch": 0.8884096024006002, "grad_norm": 0.8979836702346802, "learning_rate": 2e-05, "loss": 0.6699, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14211, "tokens_per_second_per_gpu": 11335.7, "total_tokens": 1403189390 }, { "epoch": 0.8884721180295074, "grad_norm": 0.9008039236068726, "learning_rate": 2e-05, "loss": 0.6765, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14212, "tokens_per_second_per_gpu": 11058.55, "total_tokens": 1403290032 }, { "epoch": 0.8885346336584146, "grad_norm": 0.9241250157356262, "learning_rate": 2e-05, "loss": 0.617, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14213, "tokens_per_second_per_gpu": 10386.7, "total_tokens": 1403390588 }, { "epoch": 0.8885971492873218, "grad_norm": 0.8940707445144653, "learning_rate": 2e-05, "loss": 0.6138, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14214, "tokens_per_second_per_gpu": 10879.51, "total_tokens": 1403492546 }, { "epoch": 0.888659664916229, "grad_norm": 0.88187575340271, "learning_rate": 2e-05, "loss": 0.6224, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14215, "tokens_per_second_per_gpu": 10497.53, "total_tokens": 1403592346 }, { "epoch": 0.8887221805451363, "grad_norm": 0.8579748868942261, "learning_rate": 2e-05, "loss": 0.5836, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14216, "tokens_per_second_per_gpu": 11304.69, "total_tokens": 1403691557 }, { "epoch": 0.8887846961740435, "grad_norm": 0.9055495858192444, "learning_rate": 2e-05, "loss": 0.6035, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14217, "tokens_per_second_per_gpu": 9602.43, "total_tokens": 1403787340 }, { "epoch": 0.8888472118029508, "grad_norm": 0.8848829865455627, "learning_rate": 2e-05, "loss": 0.6066, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14218, "tokens_per_second_per_gpu": 10895.02, "total_tokens": 1403885597 }, { "epoch": 0.8889097274318579, "grad_norm": 0.9058398008346558, "learning_rate": 2e-05, "loss": 0.6587, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14219, "tokens_per_second_per_gpu": 10199.91, "total_tokens": 1403984213 }, { "epoch": 0.8889722430607652, "grad_norm": 0.883949875831604, "learning_rate": 2e-05, "loss": 0.5525, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14220, "tokens_per_second_per_gpu": 9755.13, "total_tokens": 1404078967 }, { "epoch": 0.8890347586896724, "grad_norm": 0.9031848311424255, "learning_rate": 2e-05, "loss": 0.6366, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14221, "tokens_per_second_per_gpu": 10649.24, "total_tokens": 1404176660 }, { "epoch": 0.8890972743185797, "grad_norm": 0.8729172348976135, "learning_rate": 2e-05, "loss": 0.622, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14222, "tokens_per_second_per_gpu": 10825.38, "total_tokens": 1404277733 }, { "epoch": 0.8891597899474869, "grad_norm": 0.8725159764289856, "learning_rate": 2e-05, "loss": 0.5794, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14223, "tokens_per_second_per_gpu": 10096.96, "total_tokens": 1404374522 }, { "epoch": 0.8892223055763941, "grad_norm": 0.8739786744117737, "learning_rate": 2e-05, "loss": 0.5902, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14224, "tokens_per_second_per_gpu": 11044.3, "total_tokens": 1404476313 }, { "epoch": 0.8892848212053013, "grad_norm": 0.8767107129096985, "learning_rate": 2e-05, "loss": 0.5779, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14225, "tokens_per_second_per_gpu": 10043.82, "total_tokens": 1404573522 }, { "epoch": 0.8893473368342085, "grad_norm": 0.9049597978591919, "learning_rate": 2e-05, "loss": 0.64, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14226, "tokens_per_second_per_gpu": 9678.18, "total_tokens": 1404669555 }, { "epoch": 0.8894098524631158, "grad_norm": 0.9471210837364197, "learning_rate": 2e-05, "loss": 0.6197, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14227, "tokens_per_second_per_gpu": 10497.37, "total_tokens": 1404769246 }, { "epoch": 0.889472368092023, "grad_norm": 0.8560266494750977, "learning_rate": 2e-05, "loss": 0.6368, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14228, "tokens_per_second_per_gpu": 10513.68, "total_tokens": 1404867316 }, { "epoch": 0.8895348837209303, "grad_norm": 0.8783185482025146, "learning_rate": 2e-05, "loss": 0.6107, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14229, "tokens_per_second_per_gpu": 10206.82, "total_tokens": 1404963421 }, { "epoch": 0.8895973993498375, "grad_norm": 0.8967400193214417, "learning_rate": 2e-05, "loss": 0.5954, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14230, "tokens_per_second_per_gpu": 10778.28, "total_tokens": 1405057783 }, { "epoch": 0.8896599149787446, "grad_norm": 0.8816189765930176, "learning_rate": 2e-05, "loss": 0.6032, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14231, "tokens_per_second_per_gpu": 10458.31, "total_tokens": 1405155191 }, { "epoch": 0.8897224306076519, "grad_norm": 0.866591215133667, "learning_rate": 2e-05, "loss": 0.5726, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14232, "tokens_per_second_per_gpu": 10377.29, "total_tokens": 1405252453 }, { "epoch": 0.8897849462365591, "grad_norm": 0.9324222207069397, "learning_rate": 2e-05, "loss": 0.637, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14233, "tokens_per_second_per_gpu": 10621.34, "total_tokens": 1405351025 }, { "epoch": 0.8898474618654664, "grad_norm": 0.896584689617157, "learning_rate": 2e-05, "loss": 0.6197, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14234, "tokens_per_second_per_gpu": 10220.7, "total_tokens": 1405449323 }, { "epoch": 0.8899099774943736, "grad_norm": 0.8699796199798584, "learning_rate": 2e-05, "loss": 0.6003, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14235, "tokens_per_second_per_gpu": 10680.38, "total_tokens": 1405553645 }, { "epoch": 0.8899724931232809, "grad_norm": 0.9997706413269043, "learning_rate": 2e-05, "loss": 0.6038, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14236, "tokens_per_second_per_gpu": 10114.58, "total_tokens": 1405653099 }, { "epoch": 0.890035008752188, "grad_norm": 0.9309486150741577, "learning_rate": 2e-05, "loss": 0.6023, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14237, "tokens_per_second_per_gpu": 10610.44, "total_tokens": 1405749482 }, { "epoch": 0.8900975243810952, "grad_norm": 0.8964413404464722, "learning_rate": 2e-05, "loss": 0.5752, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14238, "tokens_per_second_per_gpu": 10386.84, "total_tokens": 1405846507 }, { "epoch": 0.8901600400100025, "grad_norm": 0.8382183313369751, "learning_rate": 2e-05, "loss": 0.5827, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14239, "tokens_per_second_per_gpu": 10630.34, "total_tokens": 1405946499 }, { "epoch": 0.8902225556389097, "grad_norm": 0.901461660861969, "learning_rate": 2e-05, "loss": 0.6094, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14240, "tokens_per_second_per_gpu": 11041.03, "total_tokens": 1406046525 }, { "epoch": 0.890285071267817, "grad_norm": 0.8782004714012146, "learning_rate": 2e-05, "loss": 0.6303, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14241, "tokens_per_second_per_gpu": 11171.33, "total_tokens": 1406149668 }, { "epoch": 0.8903475868967242, "grad_norm": 0.8651037216186523, "learning_rate": 2e-05, "loss": 0.6079, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14242, "tokens_per_second_per_gpu": 10946.69, "total_tokens": 1406249818 }, { "epoch": 0.8904101025256314, "grad_norm": 0.9209635853767395, "learning_rate": 2e-05, "loss": 0.5684, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14243, "tokens_per_second_per_gpu": 10169.92, "total_tokens": 1406346355 }, { "epoch": 0.8904726181545386, "grad_norm": 0.9059682488441467, "learning_rate": 2e-05, "loss": 0.6316, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14244, "tokens_per_second_per_gpu": 11064.01, "total_tokens": 1406445748 }, { "epoch": 0.8905351337834458, "grad_norm": 0.8695595860481262, "learning_rate": 2e-05, "loss": 0.5328, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14245, "tokens_per_second_per_gpu": 9606.09, "total_tokens": 1406537838 }, { "epoch": 0.8905976494123531, "grad_norm": 0.8931178450584412, "learning_rate": 2e-05, "loss": 0.5963, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14246, "tokens_per_second_per_gpu": 10336.74, "total_tokens": 1406636640 }, { "epoch": 0.8906601650412603, "grad_norm": 0.9210686087608337, "learning_rate": 2e-05, "loss": 0.603, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14247, "tokens_per_second_per_gpu": 9360.46, "total_tokens": 1406726657 }, { "epoch": 0.8907226806701676, "grad_norm": 0.8579222559928894, "learning_rate": 2e-05, "loss": 0.6174, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14248, "tokens_per_second_per_gpu": 11647.41, "total_tokens": 1406829256 }, { "epoch": 0.8907851962990748, "grad_norm": 0.8870116472244263, "learning_rate": 2e-05, "loss": 0.5917, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14249, "tokens_per_second_per_gpu": 10426.97, "total_tokens": 1406926853 }, { "epoch": 0.890847711927982, "grad_norm": 0.8725581765174866, "learning_rate": 2e-05, "loss": 0.6015, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14250, "tokens_per_second_per_gpu": 10520.44, "total_tokens": 1407026109 }, { "epoch": 0.8909102275568892, "grad_norm": 0.9038141965866089, "learning_rate": 2e-05, "loss": 0.643, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14251, "tokens_per_second_per_gpu": 11309.45, "total_tokens": 1407129610 }, { "epoch": 0.8909727431857964, "grad_norm": 0.8983489871025085, "learning_rate": 2e-05, "loss": 0.6028, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14252, "tokens_per_second_per_gpu": 11274.75, "total_tokens": 1407229847 }, { "epoch": 0.8910352588147037, "grad_norm": 0.9041839241981506, "learning_rate": 2e-05, "loss": 0.599, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14253, "tokens_per_second_per_gpu": 9993.46, "total_tokens": 1407326662 }, { "epoch": 0.8910977744436109, "grad_norm": 0.9028972387313843, "learning_rate": 2e-05, "loss": 0.6081, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14254, "tokens_per_second_per_gpu": 9503.95, "total_tokens": 1407421293 }, { "epoch": 0.8911602900725182, "grad_norm": 0.9146806597709656, "learning_rate": 2e-05, "loss": 0.604, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14255, "tokens_per_second_per_gpu": 10271.47, "total_tokens": 1407520245 }, { "epoch": 0.8912228057014253, "grad_norm": 0.9113225340843201, "learning_rate": 2e-05, "loss": 0.6097, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14256, "tokens_per_second_per_gpu": 10296.65, "total_tokens": 1407617104 }, { "epoch": 0.8912853213303326, "grad_norm": 0.8942742943763733, "learning_rate": 2e-05, "loss": 0.6174, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14257, "tokens_per_second_per_gpu": 10353.94, "total_tokens": 1407718668 }, { "epoch": 0.8913478369592398, "grad_norm": 0.9321368932723999, "learning_rate": 2e-05, "loss": 0.6271, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14258, "tokens_per_second_per_gpu": 9964.82, "total_tokens": 1407814216 }, { "epoch": 0.891410352588147, "grad_norm": 0.9495812654495239, "learning_rate": 2e-05, "loss": 0.5736, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14259, "tokens_per_second_per_gpu": 9968.34, "total_tokens": 1407904325 }, { "epoch": 0.8914728682170543, "grad_norm": 0.8801282048225403, "learning_rate": 2e-05, "loss": 0.6597, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14260, "tokens_per_second_per_gpu": 10751.78, "total_tokens": 1408008509 }, { "epoch": 0.8915353838459615, "grad_norm": 0.9196054339408875, "learning_rate": 2e-05, "loss": 0.6181, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14261, "tokens_per_second_per_gpu": 10674.84, "total_tokens": 1408107097 }, { "epoch": 0.8915978994748687, "grad_norm": 0.9237359762191772, "learning_rate": 2e-05, "loss": 0.6145, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14262, "tokens_per_second_per_gpu": 9921.66, "total_tokens": 1408203469 }, { "epoch": 0.8916604151037759, "grad_norm": 0.8714516758918762, "learning_rate": 2e-05, "loss": 0.6013, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14263, "tokens_per_second_per_gpu": 10120.33, "total_tokens": 1408301640 }, { "epoch": 0.8917229307326832, "grad_norm": 0.9580767750740051, "learning_rate": 2e-05, "loss": 0.6524, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14264, "tokens_per_second_per_gpu": 9297.64, "total_tokens": 1408393314 }, { "epoch": 0.8917854463615904, "grad_norm": 0.9559311270713806, "learning_rate": 2e-05, "loss": 0.6212, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14265, "tokens_per_second_per_gpu": 10515.84, "total_tokens": 1408492553 }, { "epoch": 0.8918479619904977, "grad_norm": 0.8821717500686646, "learning_rate": 2e-05, "loss": 0.631, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14266, "tokens_per_second_per_gpu": 10518.46, "total_tokens": 1408592566 }, { "epoch": 0.8919104776194049, "grad_norm": 0.8968773484230042, "learning_rate": 2e-05, "loss": 0.6403, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14267, "tokens_per_second_per_gpu": 10492.4, "total_tokens": 1408692437 }, { "epoch": 0.891972993248312, "grad_norm": 0.9039214849472046, "learning_rate": 2e-05, "loss": 0.6463, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14268, "tokens_per_second_per_gpu": 10756.16, "total_tokens": 1408792086 }, { "epoch": 0.8920355088772193, "grad_norm": 0.9038994908332825, "learning_rate": 2e-05, "loss": 0.6561, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14269, "tokens_per_second_per_gpu": 10119.64, "total_tokens": 1408892409 }, { "epoch": 0.8920980245061265, "grad_norm": 0.8810228705406189, "learning_rate": 2e-05, "loss": 0.6158, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14270, "tokens_per_second_per_gpu": 10632.42, "total_tokens": 1408990495 }, { "epoch": 0.8921605401350338, "grad_norm": 0.8775393962860107, "learning_rate": 2e-05, "loss": 0.5968, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14271, "tokens_per_second_per_gpu": 10374.92, "total_tokens": 1409091192 }, { "epoch": 0.892223055763941, "grad_norm": 0.9082232713699341, "learning_rate": 2e-05, "loss": 0.6432, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14272, "tokens_per_second_per_gpu": 11220.98, "total_tokens": 1409193552 }, { "epoch": 0.8922855713928483, "grad_norm": 0.8550811409950256, "learning_rate": 2e-05, "loss": 0.6054, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14273, "tokens_per_second_per_gpu": 11310.41, "total_tokens": 1409295838 }, { "epoch": 0.8923480870217554, "grad_norm": 0.9002624750137329, "learning_rate": 2e-05, "loss": 0.6158, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14274, "tokens_per_second_per_gpu": 12668.12, "total_tokens": 1409393109 }, { "epoch": 0.8924106026506626, "grad_norm": 0.885386049747467, "learning_rate": 2e-05, "loss": 0.6123, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14275, "tokens_per_second_per_gpu": 10613.61, "total_tokens": 1409492471 }, { "epoch": 0.8924731182795699, "grad_norm": 0.9334443211555481, "learning_rate": 2e-05, "loss": 0.664, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14276, "tokens_per_second_per_gpu": 11140.93, "total_tokens": 1409593727 }, { "epoch": 0.8925356339084771, "grad_norm": 0.9216626286506653, "learning_rate": 2e-05, "loss": 0.6094, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14277, "tokens_per_second_per_gpu": 10632.34, "total_tokens": 1409693940 }, { "epoch": 0.8925981495373844, "grad_norm": 0.8795565366744995, "learning_rate": 2e-05, "loss": 0.6557, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14278, "tokens_per_second_per_gpu": 10494.66, "total_tokens": 1409797586 }, { "epoch": 0.8926606651662916, "grad_norm": 0.9138122200965881, "learning_rate": 2e-05, "loss": 0.6121, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14279, "tokens_per_second_per_gpu": 9547.36, "total_tokens": 1409891337 }, { "epoch": 0.8927231807951987, "grad_norm": 0.8995585441589355, "learning_rate": 2e-05, "loss": 0.6099, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14280, "tokens_per_second_per_gpu": 10967.13, "total_tokens": 1409994804 }, { "epoch": 0.892785696424106, "grad_norm": 0.8944616913795471, "learning_rate": 2e-05, "loss": 0.6069, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14281, "tokens_per_second_per_gpu": 9983.15, "total_tokens": 1410089264 }, { "epoch": 0.8928482120530132, "grad_norm": 0.8860112428665161, "learning_rate": 2e-05, "loss": 0.5728, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14282, "tokens_per_second_per_gpu": 11111.84, "total_tokens": 1410190243 }, { "epoch": 0.8929107276819205, "grad_norm": 0.9267714619636536, "learning_rate": 2e-05, "loss": 0.6069, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14283, "tokens_per_second_per_gpu": 9749.98, "total_tokens": 1410284159 }, { "epoch": 0.8929732433108277, "grad_norm": 0.8800313472747803, "learning_rate": 2e-05, "loss": 0.5844, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14284, "tokens_per_second_per_gpu": 10268.04, "total_tokens": 1410381107 }, { "epoch": 0.893035758939735, "grad_norm": 0.9190637469291687, "learning_rate": 2e-05, "loss": 0.6292, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14285, "tokens_per_second_per_gpu": 10665.54, "total_tokens": 1410480882 }, { "epoch": 0.8930982745686422, "grad_norm": 0.9063791632652283, "learning_rate": 2e-05, "loss": 0.588, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14286, "tokens_per_second_per_gpu": 10693.03, "total_tokens": 1410578540 }, { "epoch": 0.8931607901975493, "grad_norm": 0.8858833312988281, "learning_rate": 2e-05, "loss": 0.6421, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14287, "tokens_per_second_per_gpu": 11318.67, "total_tokens": 1410683545 }, { "epoch": 0.8932233058264566, "grad_norm": 0.9003028273582458, "learning_rate": 2e-05, "loss": 0.6165, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14288, "tokens_per_second_per_gpu": 11012.96, "total_tokens": 1410780459 }, { "epoch": 0.8932858214553638, "grad_norm": 0.8908122181892395, "learning_rate": 2e-05, "loss": 0.5842, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14289, "tokens_per_second_per_gpu": 10575.17, "total_tokens": 1410881567 }, { "epoch": 0.8933483370842711, "grad_norm": 0.8865557909011841, "learning_rate": 2e-05, "loss": 0.5932, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14290, "tokens_per_second_per_gpu": 10660.51, "total_tokens": 1410977322 }, { "epoch": 0.8934108527131783, "grad_norm": 0.9100006222724915, "learning_rate": 2e-05, "loss": 0.6495, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14291, "tokens_per_second_per_gpu": 10643.99, "total_tokens": 1411078951 }, { "epoch": 0.8934733683420856, "grad_norm": 0.9289383888244629, "learning_rate": 2e-05, "loss": 0.6792, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14292, "tokens_per_second_per_gpu": 10542.31, "total_tokens": 1411176938 }, { "epoch": 0.8935358839709927, "grad_norm": 0.8838597536087036, "learning_rate": 2e-05, "loss": 0.6263, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14293, "tokens_per_second_per_gpu": 10240.3, "total_tokens": 1411275912 }, { "epoch": 0.8935983995999, "grad_norm": 0.8886645436286926, "learning_rate": 2e-05, "loss": 0.6299, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14294, "tokens_per_second_per_gpu": 10380.84, "total_tokens": 1411374869 }, { "epoch": 0.8936609152288072, "grad_norm": 0.9470241069793701, "learning_rate": 2e-05, "loss": 0.6396, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14295, "tokens_per_second_per_gpu": 10033.93, "total_tokens": 1411470670 }, { "epoch": 0.8937234308577144, "grad_norm": 0.9641224145889282, "learning_rate": 2e-05, "loss": 0.5967, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14296, "tokens_per_second_per_gpu": 10647.96, "total_tokens": 1411570704 }, { "epoch": 0.8937859464866217, "grad_norm": 0.895880937576294, "learning_rate": 2e-05, "loss": 0.6353, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14297, "tokens_per_second_per_gpu": 11506.95, "total_tokens": 1411671612 }, { "epoch": 0.8938484621155289, "grad_norm": 0.90863436460495, "learning_rate": 2e-05, "loss": 0.6326, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14298, "tokens_per_second_per_gpu": 10753.09, "total_tokens": 1411771027 }, { "epoch": 0.8939109777444361, "grad_norm": 0.8819557428359985, "learning_rate": 2e-05, "loss": 0.5602, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14299, "tokens_per_second_per_gpu": 9754.79, "total_tokens": 1411863155 }, { "epoch": 0.8939734933733433, "grad_norm": 0.8886189460754395, "learning_rate": 2e-05, "loss": 0.6251, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14300, "tokens_per_second_per_gpu": 10482.04, "total_tokens": 1411965463 }, { "epoch": 0.8940360090022506, "grad_norm": 0.8890219330787659, "learning_rate": 2e-05, "loss": 0.6485, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14301, "tokens_per_second_per_gpu": 10651.89, "total_tokens": 1412066428 }, { "epoch": 0.8940985246311578, "grad_norm": 0.8936896920204163, "learning_rate": 2e-05, "loss": 0.6366, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14302, "tokens_per_second_per_gpu": 10222.81, "total_tokens": 1412166025 }, { "epoch": 0.894161040260065, "grad_norm": 0.8802706003189087, "learning_rate": 2e-05, "loss": 0.5649, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14303, "tokens_per_second_per_gpu": 10386.43, "total_tokens": 1412263128 }, { "epoch": 0.8942235558889723, "grad_norm": 0.8730029463768005, "learning_rate": 2e-05, "loss": 0.6187, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14304, "tokens_per_second_per_gpu": 10743.98, "total_tokens": 1412363708 }, { "epoch": 0.8942860715178794, "grad_norm": 0.9018836617469788, "learning_rate": 2e-05, "loss": 0.614, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14305, "tokens_per_second_per_gpu": 10987.74, "total_tokens": 1412463724 }, { "epoch": 0.8943485871467867, "grad_norm": 0.875934362411499, "learning_rate": 2e-05, "loss": 0.5767, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14306, "tokens_per_second_per_gpu": 11007.05, "total_tokens": 1412564262 }, { "epoch": 0.8944111027756939, "grad_norm": 0.8861980438232422, "learning_rate": 2e-05, "loss": 0.584, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14307, "tokens_per_second_per_gpu": 10071.0, "total_tokens": 1412658706 }, { "epoch": 0.8944736184046012, "grad_norm": 0.896977424621582, "learning_rate": 2e-05, "loss": 0.6032, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14308, "tokens_per_second_per_gpu": 10434.73, "total_tokens": 1412755271 }, { "epoch": 0.8945361340335084, "grad_norm": 0.9143221378326416, "learning_rate": 2e-05, "loss": 0.6176, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14309, "tokens_per_second_per_gpu": 10096.98, "total_tokens": 1412848670 }, { "epoch": 0.8945986496624156, "grad_norm": 0.9101408123970032, "learning_rate": 2e-05, "loss": 0.5833, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14310, "tokens_per_second_per_gpu": 9504.32, "total_tokens": 1412942107 }, { "epoch": 0.8946611652913228, "grad_norm": 0.9168151617050171, "learning_rate": 2e-05, "loss": 0.6362, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14311, "tokens_per_second_per_gpu": 11203.51, "total_tokens": 1413045196 }, { "epoch": 0.89472368092023, "grad_norm": 0.8853030204772949, "learning_rate": 2e-05, "loss": 0.6149, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14312, "tokens_per_second_per_gpu": 10600.56, "total_tokens": 1413143617 }, { "epoch": 0.8947861965491373, "grad_norm": 0.9141972064971924, "learning_rate": 2e-05, "loss": 0.6105, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14313, "tokens_per_second_per_gpu": 10687.83, "total_tokens": 1413240936 }, { "epoch": 0.8948487121780445, "grad_norm": 0.9573039412498474, "learning_rate": 2e-05, "loss": 0.6599, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14314, "tokens_per_second_per_gpu": 11132.71, "total_tokens": 1413341255 }, { "epoch": 0.8949112278069518, "grad_norm": 0.8695572018623352, "learning_rate": 2e-05, "loss": 0.588, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14315, "tokens_per_second_per_gpu": 10616.83, "total_tokens": 1413442476 }, { "epoch": 0.894973743435859, "grad_norm": 0.8881080150604248, "learning_rate": 2e-05, "loss": 0.6342, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14316, "tokens_per_second_per_gpu": 10381.86, "total_tokens": 1413543102 }, { "epoch": 0.8950362590647661, "grad_norm": 0.9260581135749817, "learning_rate": 2e-05, "loss": 0.6282, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14317, "tokens_per_second_per_gpu": 10295.45, "total_tokens": 1413642217 }, { "epoch": 0.8950987746936734, "grad_norm": 0.8632168769836426, "learning_rate": 2e-05, "loss": 0.5859, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14318, "tokens_per_second_per_gpu": 10675.15, "total_tokens": 1413741519 }, { "epoch": 0.8951612903225806, "grad_norm": 0.9098095297813416, "learning_rate": 2e-05, "loss": 0.6225, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14319, "tokens_per_second_per_gpu": 10463.66, "total_tokens": 1413841070 }, { "epoch": 0.8952238059514879, "grad_norm": 0.8856922388076782, "learning_rate": 2e-05, "loss": 0.638, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14320, "tokens_per_second_per_gpu": 10473.01, "total_tokens": 1413941159 }, { "epoch": 0.8952863215803951, "grad_norm": 0.8842040300369263, "learning_rate": 2e-05, "loss": 0.5994, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14321, "tokens_per_second_per_gpu": 10403.12, "total_tokens": 1414037238 }, { "epoch": 0.8953488372093024, "grad_norm": 0.8755374550819397, "learning_rate": 2e-05, "loss": 0.6088, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14322, "tokens_per_second_per_gpu": 11029.2, "total_tokens": 1414137584 }, { "epoch": 0.8954113528382096, "grad_norm": 0.8917557597160339, "learning_rate": 2e-05, "loss": 0.6141, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14323, "tokens_per_second_per_gpu": 11165.67, "total_tokens": 1414236692 }, { "epoch": 0.8954738684671167, "grad_norm": 0.8955293297767639, "learning_rate": 2e-05, "loss": 0.5811, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14324, "tokens_per_second_per_gpu": 10189.37, "total_tokens": 1414331816 }, { "epoch": 0.895536384096024, "grad_norm": 0.8957864046096802, "learning_rate": 2e-05, "loss": 0.6459, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14325, "tokens_per_second_per_gpu": 10427.44, "total_tokens": 1414432100 }, { "epoch": 0.8955988997249312, "grad_norm": 0.9057884216308594, "learning_rate": 2e-05, "loss": 0.5867, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14326, "tokens_per_second_per_gpu": 9373.98, "total_tokens": 1414528546 }, { "epoch": 0.8956614153538385, "grad_norm": 0.9027177691459656, "learning_rate": 2e-05, "loss": 0.5944, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14327, "tokens_per_second_per_gpu": 9766.54, "total_tokens": 1414622204 }, { "epoch": 0.8957239309827457, "grad_norm": 0.8805027008056641, "learning_rate": 2e-05, "loss": 0.5943, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14328, "tokens_per_second_per_gpu": 9934.74, "total_tokens": 1414719021 }, { "epoch": 0.895786446611653, "grad_norm": 0.884807288646698, "learning_rate": 2e-05, "loss": 0.5575, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14329, "tokens_per_second_per_gpu": 10163.07, "total_tokens": 1414814537 }, { "epoch": 0.8958489622405601, "grad_norm": 0.908072292804718, "learning_rate": 2e-05, "loss": 0.6285, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14330, "tokens_per_second_per_gpu": 10077.71, "total_tokens": 1414910921 }, { "epoch": 0.8959114778694673, "grad_norm": 0.9352265000343323, "learning_rate": 2e-05, "loss": 0.6051, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14331, "tokens_per_second_per_gpu": 10169.44, "total_tokens": 1415006621 }, { "epoch": 0.8959739934983746, "grad_norm": 0.9064264297485352, "learning_rate": 2e-05, "loss": 0.5681, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14332, "tokens_per_second_per_gpu": 10465.68, "total_tokens": 1415099621 }, { "epoch": 0.8960365091272818, "grad_norm": 0.875888466835022, "learning_rate": 2e-05, "loss": 0.5862, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14333, "tokens_per_second_per_gpu": 10586.88, "total_tokens": 1415197370 }, { "epoch": 0.8960990247561891, "grad_norm": 0.9644646048545837, "learning_rate": 2e-05, "loss": 0.6397, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14334, "tokens_per_second_per_gpu": 10782.34, "total_tokens": 1415296769 }, { "epoch": 0.8961615403850963, "grad_norm": 0.8979285955429077, "learning_rate": 2e-05, "loss": 0.612, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14335, "tokens_per_second_per_gpu": 9729.91, "total_tokens": 1415394269 }, { "epoch": 0.8962240560140035, "grad_norm": 0.8870178461074829, "learning_rate": 2e-05, "loss": 0.5668, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14336, "tokens_per_second_per_gpu": 9756.63, "total_tokens": 1415487536 }, { "epoch": 0.8962865716429107, "grad_norm": 0.8393890857696533, "learning_rate": 2e-05, "loss": 0.6177, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14337, "tokens_per_second_per_gpu": 10870.88, "total_tokens": 1415591484 }, { "epoch": 0.896349087271818, "grad_norm": 0.9227733016014099, "learning_rate": 2e-05, "loss": 0.6283, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14338, "tokens_per_second_per_gpu": 10187.28, "total_tokens": 1415691820 }, { "epoch": 0.8964116029007252, "grad_norm": 0.9035963416099548, "learning_rate": 2e-05, "loss": 0.6241, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14339, "tokens_per_second_per_gpu": 10966.74, "total_tokens": 1415793662 }, { "epoch": 0.8964741185296324, "grad_norm": 0.9548508524894714, "learning_rate": 2e-05, "loss": 0.6099, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14340, "tokens_per_second_per_gpu": 10451.57, "total_tokens": 1415891896 }, { "epoch": 0.8965366341585397, "grad_norm": 0.917262077331543, "learning_rate": 2e-05, "loss": 0.599, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14341, "tokens_per_second_per_gpu": 9941.79, "total_tokens": 1415983234 }, { "epoch": 0.8965991497874468, "grad_norm": 0.9342786073684692, "learning_rate": 2e-05, "loss": 0.6529, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14342, "tokens_per_second_per_gpu": 10672.27, "total_tokens": 1416081935 }, { "epoch": 0.8966616654163541, "grad_norm": 0.8532731533050537, "learning_rate": 2e-05, "loss": 0.5819, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14343, "tokens_per_second_per_gpu": 10371.59, "total_tokens": 1416181593 }, { "epoch": 0.8967241810452613, "grad_norm": 0.9054906368255615, "learning_rate": 2e-05, "loss": 0.6001, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14344, "tokens_per_second_per_gpu": 9653.79, "total_tokens": 1416277197 }, { "epoch": 0.8967866966741685, "grad_norm": 0.8956854343414307, "learning_rate": 2e-05, "loss": 0.6131, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14345, "tokens_per_second_per_gpu": 10395.23, "total_tokens": 1416373878 }, { "epoch": 0.8968492123030758, "grad_norm": 0.9120367765426636, "learning_rate": 2e-05, "loss": 0.6295, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14346, "tokens_per_second_per_gpu": 11057.93, "total_tokens": 1416474759 }, { "epoch": 0.896911727931983, "grad_norm": 0.8599949479103088, "learning_rate": 2e-05, "loss": 0.6099, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14347, "tokens_per_second_per_gpu": 10108.25, "total_tokens": 1416572252 }, { "epoch": 0.8969742435608902, "grad_norm": 0.9199883341789246, "learning_rate": 2e-05, "loss": 0.5884, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14348, "tokens_per_second_per_gpu": 9672.43, "total_tokens": 1416660618 }, { "epoch": 0.8970367591897974, "grad_norm": 0.8637748956680298, "learning_rate": 2e-05, "loss": 0.6477, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14349, "tokens_per_second_per_gpu": 11083.04, "total_tokens": 1416763902 }, { "epoch": 0.8970992748187047, "grad_norm": 0.876133143901825, "learning_rate": 2e-05, "loss": 0.6015, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14350, "tokens_per_second_per_gpu": 10721.98, "total_tokens": 1416863427 }, { "epoch": 0.8971617904476119, "grad_norm": 0.914306640625, "learning_rate": 2e-05, "loss": 0.6271, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14351, "tokens_per_second_per_gpu": 10735.06, "total_tokens": 1416958610 }, { "epoch": 0.8972243060765192, "grad_norm": 1.2161924839019775, "learning_rate": 2e-05, "loss": 0.6118, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14352, "tokens_per_second_per_gpu": 10441.98, "total_tokens": 1417059232 }, { "epoch": 0.8972868217054264, "grad_norm": 0.8389318585395813, "learning_rate": 2e-05, "loss": 0.6214, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14353, "tokens_per_second_per_gpu": 11180.43, "total_tokens": 1417162936 }, { "epoch": 0.8973493373343335, "grad_norm": 0.8886899352073669, "learning_rate": 2e-05, "loss": 0.6261, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14354, "tokens_per_second_per_gpu": 10496.81, "total_tokens": 1417261758 }, { "epoch": 0.8974118529632408, "grad_norm": 0.8380928039550781, "learning_rate": 2e-05, "loss": 0.6098, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14355, "tokens_per_second_per_gpu": 11165.67, "total_tokens": 1417364869 }, { "epoch": 0.897474368592148, "grad_norm": 0.9879767298698425, "learning_rate": 2e-05, "loss": 0.614, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14356, "tokens_per_second_per_gpu": 11030.7, "total_tokens": 1417467812 }, { "epoch": 0.8975368842210553, "grad_norm": 0.921324610710144, "learning_rate": 2e-05, "loss": 0.602, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14357, "tokens_per_second_per_gpu": 9967.45, "total_tokens": 1417562694 }, { "epoch": 0.8975993998499625, "grad_norm": 0.8975562453269958, "learning_rate": 2e-05, "loss": 0.5831, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14358, "tokens_per_second_per_gpu": 10103.96, "total_tokens": 1417654335 }, { "epoch": 0.8976619154788698, "grad_norm": 0.9175743460655212, "learning_rate": 2e-05, "loss": 0.6232, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14359, "tokens_per_second_per_gpu": 11189.5, "total_tokens": 1417755014 }, { "epoch": 0.897724431107777, "grad_norm": 0.8751163482666016, "learning_rate": 2e-05, "loss": 0.6148, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14360, "tokens_per_second_per_gpu": 10529.4, "total_tokens": 1417855649 }, { "epoch": 0.8977869467366841, "grad_norm": 0.9369186162948608, "learning_rate": 2e-05, "loss": 0.6533, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14361, "tokens_per_second_per_gpu": 11044.08, "total_tokens": 1417956099 }, { "epoch": 0.8978494623655914, "grad_norm": 0.937623918056488, "learning_rate": 2e-05, "loss": 0.6467, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14362, "tokens_per_second_per_gpu": 9757.97, "total_tokens": 1418049244 }, { "epoch": 0.8979119779944986, "grad_norm": 0.8576506972312927, "learning_rate": 2e-05, "loss": 0.6162, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14363, "tokens_per_second_per_gpu": 10888.82, "total_tokens": 1418147977 }, { "epoch": 0.8979744936234059, "grad_norm": 0.8944942951202393, "learning_rate": 2e-05, "loss": 0.6334, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14364, "tokens_per_second_per_gpu": 9581.77, "total_tokens": 1418242876 }, { "epoch": 0.8980370092523131, "grad_norm": 0.9087269902229309, "learning_rate": 2e-05, "loss": 0.6396, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14365, "tokens_per_second_per_gpu": 10350.17, "total_tokens": 1418345137 }, { "epoch": 0.8980995248812204, "grad_norm": 0.9263179302215576, "learning_rate": 2e-05, "loss": 0.6362, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14366, "tokens_per_second_per_gpu": 10154.35, "total_tokens": 1418443013 }, { "epoch": 0.8981620405101275, "grad_norm": 0.8899837136268616, "learning_rate": 2e-05, "loss": 0.5927, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14367, "tokens_per_second_per_gpu": 10196.2, "total_tokens": 1418538470 }, { "epoch": 0.8982245561390347, "grad_norm": 0.8618783354759216, "learning_rate": 2e-05, "loss": 0.5842, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14368, "tokens_per_second_per_gpu": 10294.28, "total_tokens": 1418635771 }, { "epoch": 0.898287071767942, "grad_norm": 0.8862248063087463, "learning_rate": 2e-05, "loss": 0.566, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14369, "tokens_per_second_per_gpu": 9471.41, "total_tokens": 1418725270 }, { "epoch": 0.8983495873968492, "grad_norm": 0.8803583383560181, "learning_rate": 2e-05, "loss": 0.633, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14370, "tokens_per_second_per_gpu": 11114.66, "total_tokens": 1418823284 }, { "epoch": 0.8984121030257565, "grad_norm": 0.8879116177558899, "learning_rate": 2e-05, "loss": 0.6125, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14371, "tokens_per_second_per_gpu": 10968.38, "total_tokens": 1418922522 }, { "epoch": 0.8984746186546637, "grad_norm": 0.9344585537910461, "learning_rate": 2e-05, "loss": 0.5931, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14372, "tokens_per_second_per_gpu": 9979.98, "total_tokens": 1419018149 }, { "epoch": 0.8985371342835708, "grad_norm": 0.8813170194625854, "learning_rate": 2e-05, "loss": 0.6245, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14373, "tokens_per_second_per_gpu": 11199.03, "total_tokens": 1419118787 }, { "epoch": 0.8985996499124781, "grad_norm": 0.8856033682823181, "learning_rate": 2e-05, "loss": 0.5956, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14374, "tokens_per_second_per_gpu": 11697.98, "total_tokens": 1419219095 }, { "epoch": 0.8986621655413853, "grad_norm": 0.9135347604751587, "learning_rate": 2e-05, "loss": 0.6367, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14375, "tokens_per_second_per_gpu": 10407.81, "total_tokens": 1419315773 }, { "epoch": 0.8987246811702926, "grad_norm": 0.9154389500617981, "learning_rate": 2e-05, "loss": 0.6041, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14376, "tokens_per_second_per_gpu": 10573.01, "total_tokens": 1419408639 }, { "epoch": 0.8987871967991998, "grad_norm": 0.8836377263069153, "learning_rate": 2e-05, "loss": 0.5882, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14377, "tokens_per_second_per_gpu": 9726.57, "total_tokens": 1419504219 }, { "epoch": 0.8988497124281071, "grad_norm": 0.9031068086624146, "learning_rate": 2e-05, "loss": 0.6029, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14378, "tokens_per_second_per_gpu": 9529.91, "total_tokens": 1419598560 }, { "epoch": 0.8989122280570142, "grad_norm": 0.8816694617271423, "learning_rate": 2e-05, "loss": 0.5921, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14379, "tokens_per_second_per_gpu": 9982.42, "total_tokens": 1419695981 }, { "epoch": 0.8989747436859215, "grad_norm": 0.9253253936767578, "learning_rate": 2e-05, "loss": 0.6143, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14380, "tokens_per_second_per_gpu": 10693.87, "total_tokens": 1419795571 }, { "epoch": 0.8990372593148287, "grad_norm": 0.9082975387573242, "learning_rate": 2e-05, "loss": 0.6126, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14381, "tokens_per_second_per_gpu": 9606.29, "total_tokens": 1419890410 }, { "epoch": 0.8990997749437359, "grad_norm": 0.9241147637367249, "learning_rate": 2e-05, "loss": 0.6282, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14382, "tokens_per_second_per_gpu": 11144.78, "total_tokens": 1419984643 }, { "epoch": 0.8991622905726432, "grad_norm": 0.9195964932441711, "learning_rate": 2e-05, "loss": 0.5921, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14383, "tokens_per_second_per_gpu": 9927.97, "total_tokens": 1420076474 }, { "epoch": 0.8992248062015504, "grad_norm": 0.9049530029296875, "learning_rate": 2e-05, "loss": 0.6497, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14384, "tokens_per_second_per_gpu": 10533.88, "total_tokens": 1420175108 }, { "epoch": 0.8992873218304576, "grad_norm": 0.8920995593070984, "learning_rate": 2e-05, "loss": 0.6273, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14385, "tokens_per_second_per_gpu": 9986.36, "total_tokens": 1420271556 }, { "epoch": 0.8993498374593648, "grad_norm": 0.9180745482444763, "learning_rate": 2e-05, "loss": 0.5963, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14386, "tokens_per_second_per_gpu": 9446.23, "total_tokens": 1420368072 }, { "epoch": 0.899412353088272, "grad_norm": 0.8656790852546692, "learning_rate": 2e-05, "loss": 0.5944, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14387, "tokens_per_second_per_gpu": 10362.62, "total_tokens": 1420467958 }, { "epoch": 0.8994748687171793, "grad_norm": 0.9551048874855042, "learning_rate": 2e-05, "loss": 0.6282, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14388, "tokens_per_second_per_gpu": 10495.83, "total_tokens": 1420566796 }, { "epoch": 0.8995373843460865, "grad_norm": 0.869859516620636, "learning_rate": 2e-05, "loss": 0.6005, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14389, "tokens_per_second_per_gpu": 10794.06, "total_tokens": 1420662934 }, { "epoch": 0.8995998999749938, "grad_norm": 0.8668910264968872, "learning_rate": 2e-05, "loss": 0.6099, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14390, "tokens_per_second_per_gpu": 10602.48, "total_tokens": 1420762353 }, { "epoch": 0.8996624156039009, "grad_norm": 0.8821693062782288, "learning_rate": 2e-05, "loss": 0.6525, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14391, "tokens_per_second_per_gpu": 11119.5, "total_tokens": 1420865094 }, { "epoch": 0.8997249312328082, "grad_norm": 0.8696194291114807, "learning_rate": 2e-05, "loss": 0.5935, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14392, "tokens_per_second_per_gpu": 11112.87, "total_tokens": 1420961971 }, { "epoch": 0.8997874468617154, "grad_norm": 0.9095207452774048, "learning_rate": 2e-05, "loss": 0.6254, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14393, "tokens_per_second_per_gpu": 10463.19, "total_tokens": 1421059030 }, { "epoch": 0.8998499624906227, "grad_norm": 0.8781971335411072, "learning_rate": 2e-05, "loss": 0.6081, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14394, "tokens_per_second_per_gpu": 10060.25, "total_tokens": 1421152428 }, { "epoch": 0.8999124781195299, "grad_norm": 0.9008468985557556, "learning_rate": 2e-05, "loss": 0.5975, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14395, "tokens_per_second_per_gpu": 9981.44, "total_tokens": 1421245182 }, { "epoch": 0.8999749937484371, "grad_norm": 0.9050384759902954, "learning_rate": 2e-05, "loss": 0.6303, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14396, "tokens_per_second_per_gpu": 10688.59, "total_tokens": 1421342797 }, { "epoch": 0.9000375093773444, "grad_norm": 0.9023698568344116, "learning_rate": 2e-05, "loss": 0.6154, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14397, "tokens_per_second_per_gpu": 10769.16, "total_tokens": 1421442777 }, { "epoch": 0.9001000250062515, "grad_norm": 0.9190404415130615, "learning_rate": 2e-05, "loss": 0.591, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14398, "tokens_per_second_per_gpu": 9929.32, "total_tokens": 1421535339 }, { "epoch": 0.9001625406351588, "grad_norm": 0.8731198310852051, "learning_rate": 2e-05, "loss": 0.6117, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14399, "tokens_per_second_per_gpu": 10085.64, "total_tokens": 1421631976 }, { "epoch": 0.900225056264066, "grad_norm": 1.0769933462142944, "learning_rate": 2e-05, "loss": 0.6035, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14400, "tokens_per_second_per_gpu": 10530.58, "total_tokens": 1421730835 }, { "epoch": 0.9002875718929733, "grad_norm": 0.8892952799797058, "learning_rate": 2e-05, "loss": 0.6108, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14401, "tokens_per_second_per_gpu": 9782.05, "total_tokens": 1421829721 }, { "epoch": 0.9003500875218805, "grad_norm": 0.8542103171348572, "learning_rate": 2e-05, "loss": 0.6093, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14402, "tokens_per_second_per_gpu": 9637.8, "total_tokens": 1421923577 }, { "epoch": 0.9004126031507877, "grad_norm": 0.8341872096061707, "learning_rate": 2e-05, "loss": 0.6051, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14403, "tokens_per_second_per_gpu": 10858.34, "total_tokens": 1422024775 }, { "epoch": 0.9004751187796949, "grad_norm": 0.8871641159057617, "learning_rate": 2e-05, "loss": 0.5943, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14404, "tokens_per_second_per_gpu": 10421.81, "total_tokens": 1422124133 }, { "epoch": 0.9005376344086021, "grad_norm": 0.9185689687728882, "learning_rate": 2e-05, "loss": 0.6155, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14405, "tokens_per_second_per_gpu": 9864.29, "total_tokens": 1422221498 }, { "epoch": 0.9006001500375094, "grad_norm": 0.8812651038169861, "learning_rate": 2e-05, "loss": 0.6614, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14406, "tokens_per_second_per_gpu": 10883.35, "total_tokens": 1422321443 }, { "epoch": 0.9006626656664166, "grad_norm": 0.8680435419082642, "learning_rate": 2e-05, "loss": 0.6164, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14407, "tokens_per_second_per_gpu": 10442.1, "total_tokens": 1422419517 }, { "epoch": 0.9007251812953239, "grad_norm": 0.9224543571472168, "learning_rate": 2e-05, "loss": 0.6098, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14408, "tokens_per_second_per_gpu": 10477.1, "total_tokens": 1422519529 }, { "epoch": 0.9007876969242311, "grad_norm": 0.880953848361969, "learning_rate": 2e-05, "loss": 0.6046, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14409, "tokens_per_second_per_gpu": 10042.35, "total_tokens": 1422618149 }, { "epoch": 0.9008502125531382, "grad_norm": 0.8727061152458191, "learning_rate": 2e-05, "loss": 0.632, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14410, "tokens_per_second_per_gpu": 10664.91, "total_tokens": 1422716578 }, { "epoch": 0.9009127281820455, "grad_norm": 0.9295724630355835, "learning_rate": 2e-05, "loss": 0.6072, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14411, "tokens_per_second_per_gpu": 9753.23, "total_tokens": 1422815451 }, { "epoch": 0.9009752438109527, "grad_norm": 0.8947943449020386, "learning_rate": 2e-05, "loss": 0.5791, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14412, "tokens_per_second_per_gpu": 10809.08, "total_tokens": 1422912749 }, { "epoch": 0.90103775943986, "grad_norm": 0.945025622844696, "learning_rate": 2e-05, "loss": 0.6901, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14413, "tokens_per_second_per_gpu": 10444.55, "total_tokens": 1423008753 }, { "epoch": 0.9011002750687672, "grad_norm": 0.9041930437088013, "learning_rate": 2e-05, "loss": 0.6436, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14414, "tokens_per_second_per_gpu": 10476.45, "total_tokens": 1423107641 }, { "epoch": 0.9011627906976745, "grad_norm": 0.9015105962753296, "learning_rate": 2e-05, "loss": 0.6051, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14415, "tokens_per_second_per_gpu": 10538.3, "total_tokens": 1423204491 }, { "epoch": 0.9012253063265816, "grad_norm": 0.9198715090751648, "learning_rate": 2e-05, "loss": 0.6562, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14416, "tokens_per_second_per_gpu": 10640.68, "total_tokens": 1423304574 }, { "epoch": 0.9012878219554888, "grad_norm": 0.8701664209365845, "learning_rate": 2e-05, "loss": 0.58, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14417, "tokens_per_second_per_gpu": 11084.75, "total_tokens": 1423405596 }, { "epoch": 0.9013503375843961, "grad_norm": 0.9459871649742126, "learning_rate": 2e-05, "loss": 0.6203, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14418, "tokens_per_second_per_gpu": 10961.33, "total_tokens": 1423503950 }, { "epoch": 0.9014128532133033, "grad_norm": 0.9113780856132507, "learning_rate": 2e-05, "loss": 0.6224, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14419, "tokens_per_second_per_gpu": 10592.94, "total_tokens": 1423601583 }, { "epoch": 0.9014753688422106, "grad_norm": 0.8883593678474426, "learning_rate": 2e-05, "loss": 0.595, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14420, "tokens_per_second_per_gpu": 10198.03, "total_tokens": 1423696224 }, { "epoch": 0.9015378844711178, "grad_norm": 0.917093813419342, "learning_rate": 2e-05, "loss": 0.6007, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14421, "tokens_per_second_per_gpu": 9405.58, "total_tokens": 1423791295 }, { "epoch": 0.901600400100025, "grad_norm": 0.9372345209121704, "learning_rate": 2e-05, "loss": 0.5896, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14422, "tokens_per_second_per_gpu": 9736.96, "total_tokens": 1423885813 }, { "epoch": 0.9016629157289322, "grad_norm": 0.9886389970779419, "learning_rate": 2e-05, "loss": 0.5932, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14423, "tokens_per_second_per_gpu": 10515.86, "total_tokens": 1423985812 }, { "epoch": 0.9017254313578394, "grad_norm": 0.9059215188026428, "learning_rate": 2e-05, "loss": 0.5998, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14424, "tokens_per_second_per_gpu": 10387.26, "total_tokens": 1424082971 }, { "epoch": 0.9017879469867467, "grad_norm": 0.89494788646698, "learning_rate": 2e-05, "loss": 0.6154, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14425, "tokens_per_second_per_gpu": 11141.21, "total_tokens": 1424182894 }, { "epoch": 0.9018504626156539, "grad_norm": 0.8862996101379395, "learning_rate": 2e-05, "loss": 0.604, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14426, "tokens_per_second_per_gpu": 10795.9, "total_tokens": 1424282639 }, { "epoch": 0.9019129782445612, "grad_norm": 0.8655632734298706, "learning_rate": 2e-05, "loss": 0.6119, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14427, "tokens_per_second_per_gpu": 10604.36, "total_tokens": 1424381068 }, { "epoch": 0.9019754938734683, "grad_norm": 0.9306802749633789, "learning_rate": 2e-05, "loss": 0.6301, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14428, "tokens_per_second_per_gpu": 10042.14, "total_tokens": 1424478240 }, { "epoch": 0.9020380095023756, "grad_norm": 0.9459813237190247, "learning_rate": 2e-05, "loss": 0.6097, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14429, "tokens_per_second_per_gpu": 10890.05, "total_tokens": 1424574512 }, { "epoch": 0.9021005251312828, "grad_norm": 0.959034264087677, "learning_rate": 2e-05, "loss": 0.6395, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14430, "tokens_per_second_per_gpu": 10731.08, "total_tokens": 1424671698 }, { "epoch": 0.90216304076019, "grad_norm": 0.8616399168968201, "learning_rate": 2e-05, "loss": 0.6233, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14431, "tokens_per_second_per_gpu": 11008.74, "total_tokens": 1424772269 }, { "epoch": 0.9022255563890973, "grad_norm": 0.8987058401107788, "learning_rate": 2e-05, "loss": 0.6461, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14432, "tokens_per_second_per_gpu": 10475.17, "total_tokens": 1424869224 }, { "epoch": 0.9022880720180045, "grad_norm": 0.8774735927581787, "learning_rate": 2e-05, "loss": 0.5972, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14433, "tokens_per_second_per_gpu": 10802.92, "total_tokens": 1424970675 }, { "epoch": 0.9023505876469117, "grad_norm": 0.9320708513259888, "learning_rate": 2e-05, "loss": 0.6515, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14434, "tokens_per_second_per_gpu": 10980.3, "total_tokens": 1425071792 }, { "epoch": 0.9024131032758189, "grad_norm": 0.8757410049438477, "learning_rate": 2e-05, "loss": 0.5983, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14435, "tokens_per_second_per_gpu": 10073.36, "total_tokens": 1425170721 }, { "epoch": 0.9024756189047262, "grad_norm": 0.9051724076271057, "learning_rate": 2e-05, "loss": 0.5774, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14436, "tokens_per_second_per_gpu": 9705.39, "total_tokens": 1425261137 }, { "epoch": 0.9025381345336334, "grad_norm": 0.9430962800979614, "learning_rate": 2e-05, "loss": 0.6515, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14437, "tokens_per_second_per_gpu": 10384.7, "total_tokens": 1425358800 }, { "epoch": 0.9026006501625407, "grad_norm": 0.8641453981399536, "learning_rate": 2e-05, "loss": 0.6145, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14438, "tokens_per_second_per_gpu": 10678.81, "total_tokens": 1425458524 }, { "epoch": 0.9026631657914479, "grad_norm": 0.912844717502594, "learning_rate": 2e-05, "loss": 0.6455, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14439, "tokens_per_second_per_gpu": 9818.94, "total_tokens": 1425554522 }, { "epoch": 0.9027256814203551, "grad_norm": 0.9202476143836975, "learning_rate": 2e-05, "loss": 0.6427, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14440, "tokens_per_second_per_gpu": 10508.69, "total_tokens": 1425656180 }, { "epoch": 0.9027881970492623, "grad_norm": 0.9474259614944458, "learning_rate": 2e-05, "loss": 0.6258, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14441, "tokens_per_second_per_gpu": 9605.97, "total_tokens": 1425749491 }, { "epoch": 0.9028507126781695, "grad_norm": 0.9127195477485657, "learning_rate": 2e-05, "loss": 0.6588, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14442, "tokens_per_second_per_gpu": 11822.12, "total_tokens": 1425851380 }, { "epoch": 0.9029132283070768, "grad_norm": 0.8965187668800354, "learning_rate": 2e-05, "loss": 0.5763, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14443, "tokens_per_second_per_gpu": 10134.16, "total_tokens": 1425947360 }, { "epoch": 0.902975743935984, "grad_norm": 0.9846962690353394, "learning_rate": 2e-05, "loss": 0.6016, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14444, "tokens_per_second_per_gpu": 10440.29, "total_tokens": 1426043086 }, { "epoch": 0.9030382595648913, "grad_norm": 0.9239744544029236, "learning_rate": 2e-05, "loss": 0.6341, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14445, "tokens_per_second_per_gpu": 10453.38, "total_tokens": 1426145884 }, { "epoch": 0.9031007751937985, "grad_norm": 0.8727784752845764, "learning_rate": 2e-05, "loss": 0.6175, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14446, "tokens_per_second_per_gpu": 10317.75, "total_tokens": 1426243787 }, { "epoch": 0.9031632908227056, "grad_norm": 0.8686871528625488, "learning_rate": 2e-05, "loss": 0.6116, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14447, "tokens_per_second_per_gpu": 11119.13, "total_tokens": 1426344494 }, { "epoch": 0.9032258064516129, "grad_norm": 0.869600236415863, "learning_rate": 2e-05, "loss": 0.6223, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14448, "tokens_per_second_per_gpu": 10472.54, "total_tokens": 1426442898 }, { "epoch": 0.9032883220805201, "grad_norm": 0.9122861623764038, "learning_rate": 2e-05, "loss": 0.5892, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14449, "tokens_per_second_per_gpu": 10074.94, "total_tokens": 1426535776 }, { "epoch": 0.9033508377094274, "grad_norm": 0.8925975561141968, "learning_rate": 2e-05, "loss": 0.6136, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14450, "tokens_per_second_per_gpu": 10751.95, "total_tokens": 1426636378 }, { "epoch": 0.9034133533383346, "grad_norm": 0.9207896590232849, "learning_rate": 2e-05, "loss": 0.644, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14451, "tokens_per_second_per_gpu": 10580.83, "total_tokens": 1426739110 }, { "epoch": 0.9034758689672419, "grad_norm": 0.9179565906524658, "learning_rate": 2e-05, "loss": 0.6624, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14452, "tokens_per_second_per_gpu": 11076.0, "total_tokens": 1426839478 }, { "epoch": 0.903538384596149, "grad_norm": 0.9186690449714661, "learning_rate": 2e-05, "loss": 0.6402, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14453, "tokens_per_second_per_gpu": 10902.32, "total_tokens": 1426939913 }, { "epoch": 0.9036009002250562, "grad_norm": 0.886408269405365, "learning_rate": 2e-05, "loss": 0.5972, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14454, "tokens_per_second_per_gpu": 9848.08, "total_tokens": 1427034994 }, { "epoch": 0.9036634158539635, "grad_norm": 0.9400404095649719, "learning_rate": 2e-05, "loss": 0.6475, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14455, "tokens_per_second_per_gpu": 10609.69, "total_tokens": 1427130875 }, { "epoch": 0.9037259314828707, "grad_norm": 0.9096439480781555, "learning_rate": 2e-05, "loss": 0.6124, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14456, "tokens_per_second_per_gpu": 10704.42, "total_tokens": 1427230025 }, { "epoch": 0.903788447111778, "grad_norm": 0.9409228563308716, "learning_rate": 2e-05, "loss": 0.6243, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14457, "tokens_per_second_per_gpu": 10160.85, "total_tokens": 1427322320 }, { "epoch": 0.9038509627406852, "grad_norm": 0.8816423416137695, "learning_rate": 2e-05, "loss": 0.6131, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14458, "tokens_per_second_per_gpu": 9896.18, "total_tokens": 1427419482 }, { "epoch": 0.9039134783695923, "grad_norm": 0.904606819152832, "learning_rate": 2e-05, "loss": 0.6368, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14459, "tokens_per_second_per_gpu": 11049.79, "total_tokens": 1427519050 }, { "epoch": 0.9039759939984996, "grad_norm": 0.9214392900466919, "learning_rate": 2e-05, "loss": 0.6192, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14460, "tokens_per_second_per_gpu": 10755.26, "total_tokens": 1427619380 }, { "epoch": 0.9040385096274068, "grad_norm": 0.8964712023735046, "learning_rate": 2e-05, "loss": 0.6442, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14461, "tokens_per_second_per_gpu": 10521.53, "total_tokens": 1427716843 }, { "epoch": 0.9041010252563141, "grad_norm": 0.9095097184181213, "learning_rate": 2e-05, "loss": 0.6171, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14462, "tokens_per_second_per_gpu": 11033.28, "total_tokens": 1427813694 }, { "epoch": 0.9041635408852213, "grad_norm": 0.8722051978111267, "learning_rate": 2e-05, "loss": 0.6422, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14463, "tokens_per_second_per_gpu": 10548.62, "total_tokens": 1427913304 }, { "epoch": 0.9042260565141286, "grad_norm": 0.8750486373901367, "learning_rate": 2e-05, "loss": 0.5936, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14464, "tokens_per_second_per_gpu": 10663.22, "total_tokens": 1428011129 }, { "epoch": 0.9042885721430357, "grad_norm": 0.8823903203010559, "learning_rate": 2e-05, "loss": 0.6024, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14465, "tokens_per_second_per_gpu": 10008.34, "total_tokens": 1428108946 }, { "epoch": 0.904351087771943, "grad_norm": 0.9132736325263977, "learning_rate": 2e-05, "loss": 0.5927, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14466, "tokens_per_second_per_gpu": 10202.01, "total_tokens": 1428207072 }, { "epoch": 0.9044136034008502, "grad_norm": 0.903078019618988, "learning_rate": 2e-05, "loss": 0.6245, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14467, "tokens_per_second_per_gpu": 9583.67, "total_tokens": 1428304879 }, { "epoch": 0.9044761190297574, "grad_norm": 0.8925055265426636, "learning_rate": 2e-05, "loss": 0.6083, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14468, "tokens_per_second_per_gpu": 10970.76, "total_tokens": 1428399306 }, { "epoch": 0.9045386346586647, "grad_norm": 0.8884062767028809, "learning_rate": 2e-05, "loss": 0.6461, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14469, "tokens_per_second_per_gpu": 10953.67, "total_tokens": 1428500272 }, { "epoch": 0.9046011502875719, "grad_norm": 0.8817853331565857, "learning_rate": 2e-05, "loss": 0.5373, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14470, "tokens_per_second_per_gpu": 10029.88, "total_tokens": 1428592422 }, { "epoch": 0.9046636659164791, "grad_norm": 0.8746564388275146, "learning_rate": 2e-05, "loss": 0.6243, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14471, "tokens_per_second_per_gpu": 11211.25, "total_tokens": 1428693696 }, { "epoch": 0.9047261815453863, "grad_norm": 0.8577413558959961, "learning_rate": 2e-05, "loss": 0.6012, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14472, "tokens_per_second_per_gpu": 10202.49, "total_tokens": 1428792738 }, { "epoch": 0.9047886971742936, "grad_norm": 0.8902454376220703, "learning_rate": 2e-05, "loss": 0.5707, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14473, "tokens_per_second_per_gpu": 10504.7, "total_tokens": 1428889309 }, { "epoch": 0.9048512128032008, "grad_norm": 0.8789322376251221, "learning_rate": 2e-05, "loss": 0.6321, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14474, "tokens_per_second_per_gpu": 10438.1, "total_tokens": 1428989057 }, { "epoch": 0.904913728432108, "grad_norm": 0.9076630473136902, "learning_rate": 2e-05, "loss": 0.6376, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14475, "tokens_per_second_per_gpu": 9746.23, "total_tokens": 1429081622 }, { "epoch": 0.9049762440610153, "grad_norm": 0.9240155220031738, "learning_rate": 2e-05, "loss": 0.636, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14476, "tokens_per_second_per_gpu": 9587.55, "total_tokens": 1429175540 }, { "epoch": 0.9050387596899225, "grad_norm": 0.8826929926872253, "learning_rate": 2e-05, "loss": 0.5678, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14477, "tokens_per_second_per_gpu": 9764.61, "total_tokens": 1429267690 }, { "epoch": 0.9051012753188297, "grad_norm": 0.8860317468643188, "learning_rate": 2e-05, "loss": 0.6197, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14478, "tokens_per_second_per_gpu": 10406.25, "total_tokens": 1429366873 }, { "epoch": 0.9051637909477369, "grad_norm": 0.8997532725334167, "learning_rate": 2e-05, "loss": 0.6325, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14479, "tokens_per_second_per_gpu": 10662.75, "total_tokens": 1429464405 }, { "epoch": 0.9052263065766442, "grad_norm": 0.8826406002044678, "learning_rate": 2e-05, "loss": 0.554, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14480, "tokens_per_second_per_gpu": 9680.04, "total_tokens": 1429557687 }, { "epoch": 0.9052888222055514, "grad_norm": 0.9134086966514587, "learning_rate": 2e-05, "loss": 0.6174, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14481, "tokens_per_second_per_gpu": 9963.09, "total_tokens": 1429652879 }, { "epoch": 0.9053513378344586, "grad_norm": 0.8698821067810059, "learning_rate": 2e-05, "loss": 0.6299, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14482, "tokens_per_second_per_gpu": 10531.04, "total_tokens": 1429754758 }, { "epoch": 0.9054138534633659, "grad_norm": 0.8930383324623108, "learning_rate": 2e-05, "loss": 0.6228, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14483, "tokens_per_second_per_gpu": 10353.57, "total_tokens": 1429854259 }, { "epoch": 0.905476369092273, "grad_norm": 0.9089725613594055, "learning_rate": 2e-05, "loss": 0.6292, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14484, "tokens_per_second_per_gpu": 9926.64, "total_tokens": 1429949932 }, { "epoch": 0.9055388847211803, "grad_norm": 0.9089100360870361, "learning_rate": 2e-05, "loss": 0.5658, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14485, "tokens_per_second_per_gpu": 9575.96, "total_tokens": 1430038741 }, { "epoch": 0.9056014003500875, "grad_norm": 0.8893977403640747, "learning_rate": 2e-05, "loss": 0.5959, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14486, "tokens_per_second_per_gpu": 9721.83, "total_tokens": 1430134789 }, { "epoch": 0.9056639159789948, "grad_norm": 0.9032770991325378, "learning_rate": 2e-05, "loss": 0.602, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14487, "tokens_per_second_per_gpu": 10709.29, "total_tokens": 1430229135 }, { "epoch": 0.905726431607902, "grad_norm": 0.8963657021522522, "learning_rate": 2e-05, "loss": 0.6095, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14488, "tokens_per_second_per_gpu": 9578.6, "total_tokens": 1430321620 }, { "epoch": 0.9057889472368092, "grad_norm": 0.923452615737915, "learning_rate": 2e-05, "loss": 0.6376, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14489, "tokens_per_second_per_gpu": 11157.59, "total_tokens": 1430421622 }, { "epoch": 0.9058514628657164, "grad_norm": 0.9159662127494812, "learning_rate": 2e-05, "loss": 0.651, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14490, "tokens_per_second_per_gpu": 10914.16, "total_tokens": 1430520106 }, { "epoch": 0.9059139784946236, "grad_norm": 0.9205392599105835, "learning_rate": 2e-05, "loss": 0.6525, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14491, "tokens_per_second_per_gpu": 10691.42, "total_tokens": 1430615570 }, { "epoch": 0.9059764941235309, "grad_norm": 0.8987693786621094, "learning_rate": 2e-05, "loss": 0.5991, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14492, "tokens_per_second_per_gpu": 10811.82, "total_tokens": 1430714923 }, { "epoch": 0.9060390097524381, "grad_norm": 0.9106598496437073, "learning_rate": 2e-05, "loss": 0.6284, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14493, "tokens_per_second_per_gpu": 10119.17, "total_tokens": 1430814025 }, { "epoch": 0.9061015253813454, "grad_norm": 0.9143227934837341, "learning_rate": 2e-05, "loss": 0.6217, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14494, "tokens_per_second_per_gpu": 9703.19, "total_tokens": 1430911885 }, { "epoch": 0.9061640410102526, "grad_norm": 0.883887529373169, "learning_rate": 2e-05, "loss": 0.5622, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14495, "tokens_per_second_per_gpu": 9783.79, "total_tokens": 1431006482 }, { "epoch": 0.9062265566391597, "grad_norm": 0.9154666662216187, "learning_rate": 2e-05, "loss": 0.5983, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14496, "tokens_per_second_per_gpu": 9820.24, "total_tokens": 1431100012 }, { "epoch": 0.906289072268067, "grad_norm": 0.9356397390365601, "learning_rate": 2e-05, "loss": 0.5917, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14497, "tokens_per_second_per_gpu": 10183.43, "total_tokens": 1431191105 }, { "epoch": 0.9063515878969742, "grad_norm": 0.8882889747619629, "learning_rate": 2e-05, "loss": 0.5853, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14498, "tokens_per_second_per_gpu": 10574.38, "total_tokens": 1431288435 }, { "epoch": 0.9064141035258815, "grad_norm": 0.9177926182746887, "learning_rate": 2e-05, "loss": 0.6591, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14499, "tokens_per_second_per_gpu": 10877.38, "total_tokens": 1431388702 }, { "epoch": 0.9064766191547887, "grad_norm": 0.925163745880127, "learning_rate": 2e-05, "loss": 0.5783, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14500, "tokens_per_second_per_gpu": 10487.28, "total_tokens": 1431485945 }, { "epoch": 0.906539134783696, "grad_norm": 0.9095767736434937, "learning_rate": 2e-05, "loss": 0.5952, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14501, "tokens_per_second_per_gpu": 9680.69, "total_tokens": 1431580050 }, { "epoch": 0.9066016504126031, "grad_norm": 0.8820641040802002, "learning_rate": 2e-05, "loss": 0.6097, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14502, "tokens_per_second_per_gpu": 10484.01, "total_tokens": 1431679876 }, { "epoch": 0.9066641660415103, "grad_norm": 0.9346299767494202, "learning_rate": 2e-05, "loss": 0.6276, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14503, "tokens_per_second_per_gpu": 10441.22, "total_tokens": 1431772167 }, { "epoch": 0.9067266816704176, "grad_norm": 0.8570979833602905, "learning_rate": 2e-05, "loss": 0.6266, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14504, "tokens_per_second_per_gpu": 10811.5, "total_tokens": 1431874991 }, { "epoch": 0.9067891972993248, "grad_norm": 0.9182137846946716, "learning_rate": 2e-05, "loss": 0.649, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14505, "tokens_per_second_per_gpu": 10198.21, "total_tokens": 1431969904 }, { "epoch": 0.9068517129282321, "grad_norm": 0.9136247634887695, "learning_rate": 2e-05, "loss": 0.6198, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14506, "tokens_per_second_per_gpu": 10585.51, "total_tokens": 1432065822 }, { "epoch": 0.9069142285571393, "grad_norm": 0.8623942136764526, "learning_rate": 2e-05, "loss": 0.6077, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14507, "tokens_per_second_per_gpu": 10551.14, "total_tokens": 1432162736 }, { "epoch": 0.9069767441860465, "grad_norm": 0.8624029755592346, "learning_rate": 2e-05, "loss": 0.6142, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14508, "tokens_per_second_per_gpu": 10799.94, "total_tokens": 1432261814 }, { "epoch": 0.9070392598149537, "grad_norm": 0.8807697296142578, "learning_rate": 2e-05, "loss": 0.6323, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14509, "tokens_per_second_per_gpu": 10004.41, "total_tokens": 1432358507 }, { "epoch": 0.907101775443861, "grad_norm": 0.8552743196487427, "learning_rate": 2e-05, "loss": 0.5954, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14510, "tokens_per_second_per_gpu": 10574.19, "total_tokens": 1432454868 }, { "epoch": 0.9071642910727682, "grad_norm": 0.8926974534988403, "learning_rate": 2e-05, "loss": 0.6123, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14511, "tokens_per_second_per_gpu": 10404.7, "total_tokens": 1432550810 }, { "epoch": 0.9072268067016754, "grad_norm": 0.9223540425300598, "learning_rate": 2e-05, "loss": 0.5821, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14512, "tokens_per_second_per_gpu": 13547.92, "total_tokens": 1432643439 }, { "epoch": 0.9072893223305827, "grad_norm": 0.8891322016716003, "learning_rate": 2e-05, "loss": 0.6298, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14513, "tokens_per_second_per_gpu": 11040.32, "total_tokens": 1432747249 }, { "epoch": 0.9073518379594899, "grad_norm": 0.8937302827835083, "learning_rate": 2e-05, "loss": 0.6299, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14514, "tokens_per_second_per_gpu": 9992.83, "total_tokens": 1432841616 }, { "epoch": 0.9074143535883971, "grad_norm": 0.9058197736740112, "learning_rate": 2e-05, "loss": 0.6143, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14515, "tokens_per_second_per_gpu": 9985.87, "total_tokens": 1432940622 }, { "epoch": 0.9074768692173043, "grad_norm": 0.8990795016288757, "learning_rate": 2e-05, "loss": 0.5932, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14516, "tokens_per_second_per_gpu": 10789.44, "total_tokens": 1433038651 }, { "epoch": 0.9075393848462115, "grad_norm": 0.897575855255127, "learning_rate": 2e-05, "loss": 0.6751, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14517, "tokens_per_second_per_gpu": 10803.89, "total_tokens": 1433140726 }, { "epoch": 0.9076019004751188, "grad_norm": 0.8995514512062073, "learning_rate": 2e-05, "loss": 0.6256, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14518, "tokens_per_second_per_gpu": 9788.94, "total_tokens": 1433237685 }, { "epoch": 0.907664416104026, "grad_norm": 0.8668937087059021, "learning_rate": 2e-05, "loss": 0.5889, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14519, "tokens_per_second_per_gpu": 9680.87, "total_tokens": 1433335845 }, { "epoch": 0.9077269317329333, "grad_norm": 0.8812596201896667, "learning_rate": 2e-05, "loss": 0.6104, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14520, "tokens_per_second_per_gpu": 10852.35, "total_tokens": 1433434289 }, { "epoch": 0.9077894473618404, "grad_norm": 0.9232355952262878, "learning_rate": 2e-05, "loss": 0.5724, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14521, "tokens_per_second_per_gpu": 10201.59, "total_tokens": 1433525230 }, { "epoch": 0.9078519629907477, "grad_norm": 0.8887559771537781, "learning_rate": 2e-05, "loss": 0.5828, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14522, "tokens_per_second_per_gpu": 10824.72, "total_tokens": 1433623134 }, { "epoch": 0.9079144786196549, "grad_norm": 0.9055480360984802, "learning_rate": 2e-05, "loss": 0.5751, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14523, "tokens_per_second_per_gpu": 10128.29, "total_tokens": 1433717406 }, { "epoch": 0.9079769942485622, "grad_norm": 0.8761327266693115, "learning_rate": 2e-05, "loss": 0.58, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14524, "tokens_per_second_per_gpu": 9899.62, "total_tokens": 1433813537 }, { "epoch": 0.9080395098774694, "grad_norm": 0.9307430386543274, "learning_rate": 2e-05, "loss": 0.6073, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14525, "tokens_per_second_per_gpu": 10702.47, "total_tokens": 1433909706 }, { "epoch": 0.9081020255063766, "grad_norm": 0.9101316332817078, "learning_rate": 2e-05, "loss": 0.6006, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14526, "tokens_per_second_per_gpu": 10013.5, "total_tokens": 1434005298 }, { "epoch": 0.9081645411352838, "grad_norm": 0.9047828316688538, "learning_rate": 2e-05, "loss": 0.6561, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14527, "tokens_per_second_per_gpu": 10111.17, "total_tokens": 1434103783 }, { "epoch": 0.908227056764191, "grad_norm": 0.9117571711540222, "learning_rate": 2e-05, "loss": 0.6106, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14528, "tokens_per_second_per_gpu": 10065.35, "total_tokens": 1434200131 }, { "epoch": 0.9082895723930983, "grad_norm": 0.8544573783874512, "learning_rate": 2e-05, "loss": 0.5672, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14529, "tokens_per_second_per_gpu": 10623.72, "total_tokens": 1434297107 }, { "epoch": 0.9083520880220055, "grad_norm": 0.8747199773788452, "learning_rate": 2e-05, "loss": 0.6279, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14530, "tokens_per_second_per_gpu": 11037.23, "total_tokens": 1434395650 }, { "epoch": 0.9084146036509128, "grad_norm": 0.9049199819564819, "learning_rate": 2e-05, "loss": 0.6344, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14531, "tokens_per_second_per_gpu": 10636.57, "total_tokens": 1434493278 }, { "epoch": 0.90847711927982, "grad_norm": 0.9245306253433228, "learning_rate": 2e-05, "loss": 0.6399, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14532, "tokens_per_second_per_gpu": 10602.69, "total_tokens": 1434589666 }, { "epoch": 0.9085396349087271, "grad_norm": 0.9085504412651062, "learning_rate": 2e-05, "loss": 0.6001, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14533, "tokens_per_second_per_gpu": 9455.22, "total_tokens": 1434684234 }, { "epoch": 0.9086021505376344, "grad_norm": 0.892966091632843, "learning_rate": 2e-05, "loss": 0.5724, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14534, "tokens_per_second_per_gpu": 10342.72, "total_tokens": 1434776739 }, { "epoch": 0.9086646661665416, "grad_norm": 0.9219091534614563, "learning_rate": 2e-05, "loss": 0.6047, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14535, "tokens_per_second_per_gpu": 9393.61, "total_tokens": 1434867803 }, { "epoch": 0.9087271817954489, "grad_norm": 0.9320824146270752, "learning_rate": 2e-05, "loss": 0.6181, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14536, "tokens_per_second_per_gpu": 10117.44, "total_tokens": 1434961504 }, { "epoch": 0.9087896974243561, "grad_norm": 0.8839580416679382, "learning_rate": 2e-05, "loss": 0.5884, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14537, "tokens_per_second_per_gpu": 10145.32, "total_tokens": 1435056873 }, { "epoch": 0.9088522130532634, "grad_norm": 0.9104893207550049, "learning_rate": 2e-05, "loss": 0.612, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14538, "tokens_per_second_per_gpu": 10442.4, "total_tokens": 1435153502 }, { "epoch": 0.9089147286821705, "grad_norm": 0.881281316280365, "learning_rate": 2e-05, "loss": 0.6173, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14539, "tokens_per_second_per_gpu": 9844.18, "total_tokens": 1435248781 }, { "epoch": 0.9089772443110777, "grad_norm": 0.9439356327056885, "learning_rate": 2e-05, "loss": 0.6272, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14540, "tokens_per_second_per_gpu": 9110.51, "total_tokens": 1435340945 }, { "epoch": 0.909039759939985, "grad_norm": 1.1521930694580078, "learning_rate": 2e-05, "loss": 0.6469, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14541, "tokens_per_second_per_gpu": 9618.05, "total_tokens": 1435436589 }, { "epoch": 0.9091022755688922, "grad_norm": 0.8911756277084351, "learning_rate": 2e-05, "loss": 0.6066, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14542, "tokens_per_second_per_gpu": 10999.17, "total_tokens": 1435534158 }, { "epoch": 0.9091647911977995, "grad_norm": 0.9023198485374451, "learning_rate": 2e-05, "loss": 0.6361, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14543, "tokens_per_second_per_gpu": 10640.15, "total_tokens": 1435632290 }, { "epoch": 0.9092273068267067, "grad_norm": 0.881888747215271, "learning_rate": 2e-05, "loss": 0.6058, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14544, "tokens_per_second_per_gpu": 10322.11, "total_tokens": 1435730207 }, { "epoch": 0.9092898224556138, "grad_norm": 0.889726459980011, "learning_rate": 2e-05, "loss": 0.5906, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14545, "tokens_per_second_per_gpu": 10103.53, "total_tokens": 1435827077 }, { "epoch": 0.9093523380845211, "grad_norm": 0.9292919635772705, "learning_rate": 2e-05, "loss": 0.6389, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14546, "tokens_per_second_per_gpu": 10785.57, "total_tokens": 1435924749 }, { "epoch": 0.9094148537134283, "grad_norm": 0.8916361331939697, "learning_rate": 2e-05, "loss": 0.604, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14547, "tokens_per_second_per_gpu": 11710.76, "total_tokens": 1436024770 }, { "epoch": 0.9094773693423356, "grad_norm": 0.9049341082572937, "learning_rate": 2e-05, "loss": 0.6386, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14548, "tokens_per_second_per_gpu": 11268.26, "total_tokens": 1436124864 }, { "epoch": 0.9095398849712428, "grad_norm": 0.8607706427574158, "learning_rate": 2e-05, "loss": 0.583, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14549, "tokens_per_second_per_gpu": 10259.53, "total_tokens": 1436224487 }, { "epoch": 0.9096024006001501, "grad_norm": 0.9311413168907166, "learning_rate": 2e-05, "loss": 0.6135, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14550, "tokens_per_second_per_gpu": 10298.13, "total_tokens": 1436322241 }, { "epoch": 0.9096649162290573, "grad_norm": 0.8560624718666077, "learning_rate": 2e-05, "loss": 0.5845, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14551, "tokens_per_second_per_gpu": 10626.01, "total_tokens": 1436423607 }, { "epoch": 0.9097274318579645, "grad_norm": 0.8752605319023132, "learning_rate": 2e-05, "loss": 0.5657, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14552, "tokens_per_second_per_gpu": 10447.4, "total_tokens": 1436520761 }, { "epoch": 0.9097899474868717, "grad_norm": 0.9239526987075806, "learning_rate": 2e-05, "loss": 0.6367, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14553, "tokens_per_second_per_gpu": 10716.09, "total_tokens": 1436622100 }, { "epoch": 0.9098524631157789, "grad_norm": 0.8750607371330261, "learning_rate": 2e-05, "loss": 0.6058, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14554, "tokens_per_second_per_gpu": 10477.39, "total_tokens": 1436722521 }, { "epoch": 0.9099149787446862, "grad_norm": 0.8894999027252197, "learning_rate": 2e-05, "loss": 0.6099, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14555, "tokens_per_second_per_gpu": 10024.44, "total_tokens": 1436817651 }, { "epoch": 0.9099774943735934, "grad_norm": 0.8787860870361328, "learning_rate": 2e-05, "loss": 0.5743, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14556, "tokens_per_second_per_gpu": 10096.56, "total_tokens": 1436915827 }, { "epoch": 0.9100400100025007, "grad_norm": 0.8620316386222839, "learning_rate": 2e-05, "loss": 0.5787, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14557, "tokens_per_second_per_gpu": 10502.48, "total_tokens": 1437012297 }, { "epoch": 0.9101025256314078, "grad_norm": 1.003522276878357, "learning_rate": 2e-05, "loss": 0.6056, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14558, "tokens_per_second_per_gpu": 10687.28, "total_tokens": 1437108091 }, { "epoch": 0.910165041260315, "grad_norm": 0.888908863067627, "learning_rate": 2e-05, "loss": 0.6291, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14559, "tokens_per_second_per_gpu": 10506.76, "total_tokens": 1437206486 }, { "epoch": 0.9102275568892223, "grad_norm": 0.8913189768791199, "learning_rate": 2e-05, "loss": 0.6138, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14560, "tokens_per_second_per_gpu": 10398.04, "total_tokens": 1437307699 }, { "epoch": 0.9102900725181295, "grad_norm": 0.9334678649902344, "learning_rate": 2e-05, "loss": 0.6102, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14561, "tokens_per_second_per_gpu": 10061.32, "total_tokens": 1437403573 }, { "epoch": 0.9103525881470368, "grad_norm": 0.881254255771637, "learning_rate": 2e-05, "loss": 0.594, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14562, "tokens_per_second_per_gpu": 10242.2, "total_tokens": 1437499705 }, { "epoch": 0.910415103775944, "grad_norm": 0.8947475552558899, "learning_rate": 2e-05, "loss": 0.6057, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14563, "tokens_per_second_per_gpu": 10716.54, "total_tokens": 1437595630 }, { "epoch": 0.9104776194048512, "grad_norm": 0.8998247981071472, "learning_rate": 2e-05, "loss": 0.6381, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14564, "tokens_per_second_per_gpu": 10863.12, "total_tokens": 1437695019 }, { "epoch": 0.9105401350337584, "grad_norm": 0.9430014491081238, "learning_rate": 2e-05, "loss": 0.6281, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14565, "tokens_per_second_per_gpu": 10689.34, "total_tokens": 1437793198 }, { "epoch": 0.9106026506626657, "grad_norm": 0.8501725196838379, "learning_rate": 2e-05, "loss": 0.6119, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14566, "tokens_per_second_per_gpu": 11225.68, "total_tokens": 1437897304 }, { "epoch": 0.9106651662915729, "grad_norm": 0.9094390869140625, "learning_rate": 2e-05, "loss": 0.6242, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14567, "tokens_per_second_per_gpu": 10482.69, "total_tokens": 1437994002 }, { "epoch": 0.9107276819204801, "grad_norm": 0.8279213905334473, "learning_rate": 2e-05, "loss": 0.5778, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14568, "tokens_per_second_per_gpu": 10660.5, "total_tokens": 1438099294 }, { "epoch": 0.9107901975493874, "grad_norm": 0.8860014081001282, "learning_rate": 2e-05, "loss": 0.6162, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14569, "tokens_per_second_per_gpu": 10544.76, "total_tokens": 1438201683 }, { "epoch": 0.9108527131782945, "grad_norm": 0.8998523950576782, "learning_rate": 2e-05, "loss": 0.6401, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14570, "tokens_per_second_per_gpu": 11157.68, "total_tokens": 1438304987 }, { "epoch": 0.9109152288072018, "grad_norm": 0.9375467896461487, "learning_rate": 2e-05, "loss": 0.5861, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14571, "tokens_per_second_per_gpu": 10136.63, "total_tokens": 1438400857 }, { "epoch": 0.910977744436109, "grad_norm": 0.9242175817489624, "learning_rate": 2e-05, "loss": 0.5971, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14572, "tokens_per_second_per_gpu": 10150.47, "total_tokens": 1438497042 }, { "epoch": 0.9110402600650163, "grad_norm": 0.8947046399116516, "learning_rate": 2e-05, "loss": 0.6336, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14573, "tokens_per_second_per_gpu": 10912.61, "total_tokens": 1438595023 }, { "epoch": 0.9111027756939235, "grad_norm": 0.8813335299491882, "learning_rate": 2e-05, "loss": 0.6284, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14574, "tokens_per_second_per_gpu": 10905.19, "total_tokens": 1438694962 }, { "epoch": 0.9111652913228308, "grad_norm": 0.8691104650497437, "learning_rate": 2e-05, "loss": 0.6061, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14575, "tokens_per_second_per_gpu": 11008.65, "total_tokens": 1438794971 }, { "epoch": 0.9112278069517379, "grad_norm": 0.8442134261131287, "learning_rate": 2e-05, "loss": 0.6232, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14576, "tokens_per_second_per_gpu": 11491.38, "total_tokens": 1438899682 }, { "epoch": 0.9112903225806451, "grad_norm": 0.9116926193237305, "learning_rate": 2e-05, "loss": 0.6297, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14577, "tokens_per_second_per_gpu": 10352.0, "total_tokens": 1438998296 }, { "epoch": 0.9113528382095524, "grad_norm": 0.8521417379379272, "learning_rate": 2e-05, "loss": 0.6656, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14578, "tokens_per_second_per_gpu": 10072.52, "total_tokens": 1439098093 }, { "epoch": 0.9114153538384596, "grad_norm": 0.9308702945709229, "learning_rate": 2e-05, "loss": 0.5994, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14579, "tokens_per_second_per_gpu": 9942.82, "total_tokens": 1439193552 }, { "epoch": 0.9114778694673669, "grad_norm": 0.9771367907524109, "learning_rate": 2e-05, "loss": 0.685, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14580, "tokens_per_second_per_gpu": 10094.03, "total_tokens": 1439287305 }, { "epoch": 0.9115403850962741, "grad_norm": 0.8784776926040649, "learning_rate": 2e-05, "loss": 0.6006, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14581, "tokens_per_second_per_gpu": 10376.7, "total_tokens": 1439386124 }, { "epoch": 0.9116029007251812, "grad_norm": 0.9040942192077637, "learning_rate": 2e-05, "loss": 0.5841, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14582, "tokens_per_second_per_gpu": 9504.54, "total_tokens": 1439481622 }, { "epoch": 0.9116654163540885, "grad_norm": 0.8997484445571899, "learning_rate": 2e-05, "loss": 0.5944, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14583, "tokens_per_second_per_gpu": 10029.72, "total_tokens": 1439579282 }, { "epoch": 0.9117279319829957, "grad_norm": 0.9065990447998047, "learning_rate": 2e-05, "loss": 0.6181, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14584, "tokens_per_second_per_gpu": 10122.67, "total_tokens": 1439680045 }, { "epoch": 0.911790447611903, "grad_norm": 0.9111484289169312, "learning_rate": 2e-05, "loss": 0.6697, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14585, "tokens_per_second_per_gpu": 10521.54, "total_tokens": 1439780465 }, { "epoch": 0.9118529632408102, "grad_norm": 0.8853350877761841, "learning_rate": 2e-05, "loss": 0.6199, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14586, "tokens_per_second_per_gpu": 10539.09, "total_tokens": 1439882456 }, { "epoch": 0.9119154788697175, "grad_norm": 0.8885700702667236, "learning_rate": 2e-05, "loss": 0.5887, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14587, "tokens_per_second_per_gpu": 10099.57, "total_tokens": 1439977961 }, { "epoch": 0.9119779944986247, "grad_norm": 0.8691630363464355, "learning_rate": 2e-05, "loss": 0.6338, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14588, "tokens_per_second_per_gpu": 10824.95, "total_tokens": 1440082850 }, { "epoch": 0.9120405101275318, "grad_norm": 0.9056453108787537, "learning_rate": 2e-05, "loss": 0.6139, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14589, "tokens_per_second_per_gpu": 9880.31, "total_tokens": 1440177703 }, { "epoch": 0.9121030257564391, "grad_norm": 0.8915550708770752, "learning_rate": 2e-05, "loss": 0.6072, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14590, "tokens_per_second_per_gpu": 10514.55, "total_tokens": 1440276035 }, { "epoch": 0.9121655413853463, "grad_norm": 0.8950681686401367, "learning_rate": 2e-05, "loss": 0.6089, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14591, "tokens_per_second_per_gpu": 11139.94, "total_tokens": 1440377752 }, { "epoch": 0.9122280570142536, "grad_norm": 0.887303352355957, "learning_rate": 2e-05, "loss": 0.6084, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14592, "tokens_per_second_per_gpu": 10746.57, "total_tokens": 1440475663 }, { "epoch": 0.9122905726431608, "grad_norm": 0.8748335838317871, "learning_rate": 2e-05, "loss": 0.6108, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14593, "tokens_per_second_per_gpu": 9708.51, "total_tokens": 1440571055 }, { "epoch": 0.9123530882720681, "grad_norm": 0.8700657486915588, "learning_rate": 2e-05, "loss": 0.6359, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14594, "tokens_per_second_per_gpu": 10550.16, "total_tokens": 1440672719 }, { "epoch": 0.9124156039009752, "grad_norm": 0.903811514377594, "learning_rate": 2e-05, "loss": 0.5869, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14595, "tokens_per_second_per_gpu": 11145.7, "total_tokens": 1440768152 }, { "epoch": 0.9124781195298824, "grad_norm": 0.8655623197555542, "learning_rate": 2e-05, "loss": 0.621, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14596, "tokens_per_second_per_gpu": 10867.28, "total_tokens": 1440870626 }, { "epoch": 0.9125406351587897, "grad_norm": 0.8991253972053528, "learning_rate": 2e-05, "loss": 0.5699, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14597, "tokens_per_second_per_gpu": 9526.93, "total_tokens": 1440962116 }, { "epoch": 0.9126031507876969, "grad_norm": 0.904284656047821, "learning_rate": 2e-05, "loss": 0.6031, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14598, "tokens_per_second_per_gpu": 10300.83, "total_tokens": 1441061618 }, { "epoch": 0.9126656664166042, "grad_norm": 0.879393458366394, "learning_rate": 2e-05, "loss": 0.6559, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14599, "tokens_per_second_per_gpu": 10984.56, "total_tokens": 1441162171 }, { "epoch": 0.9127281820455114, "grad_norm": 0.8924060463905334, "learning_rate": 2e-05, "loss": 0.6155, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14600, "tokens_per_second_per_gpu": 10975.82, "total_tokens": 1441264305 }, { "epoch": 0.9127906976744186, "grad_norm": 0.8747789859771729, "learning_rate": 2e-05, "loss": 0.5917, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14601, "tokens_per_second_per_gpu": 10659.11, "total_tokens": 1441362025 }, { "epoch": 0.9128532133033258, "grad_norm": 0.9063199758529663, "learning_rate": 2e-05, "loss": 0.5907, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14602, "tokens_per_second_per_gpu": 10107.32, "total_tokens": 1441462072 }, { "epoch": 0.912915728932233, "grad_norm": 0.8648191094398499, "learning_rate": 2e-05, "loss": 0.5949, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14603, "tokens_per_second_per_gpu": 11452.03, "total_tokens": 1441564604 }, { "epoch": 0.9129782445611403, "grad_norm": 0.836742103099823, "learning_rate": 2e-05, "loss": 0.5945, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14604, "tokens_per_second_per_gpu": 11118.83, "total_tokens": 1441668843 }, { "epoch": 0.9130407601900475, "grad_norm": 0.8980869650840759, "learning_rate": 2e-05, "loss": 0.5926, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14605, "tokens_per_second_per_gpu": 10112.16, "total_tokens": 1441764448 }, { "epoch": 0.9131032758189548, "grad_norm": 0.8856457471847534, "learning_rate": 2e-05, "loss": 0.6051, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14606, "tokens_per_second_per_gpu": 10876.44, "total_tokens": 1441866069 }, { "epoch": 0.9131657914478619, "grad_norm": 0.9037086963653564, "learning_rate": 2e-05, "loss": 0.5931, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14607, "tokens_per_second_per_gpu": 9949.21, "total_tokens": 1441961200 }, { "epoch": 0.9132283070767692, "grad_norm": 0.904596209526062, "learning_rate": 2e-05, "loss": 0.6358, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14608, "tokens_per_second_per_gpu": 10686.12, "total_tokens": 1442059149 }, { "epoch": 0.9132908227056764, "grad_norm": 0.884976327419281, "learning_rate": 2e-05, "loss": 0.6415, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14609, "tokens_per_second_per_gpu": 11090.97, "total_tokens": 1442160176 }, { "epoch": 0.9133533383345837, "grad_norm": 0.8848055005073547, "learning_rate": 2e-05, "loss": 0.6125, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14610, "tokens_per_second_per_gpu": 10118.75, "total_tokens": 1442258664 }, { "epoch": 0.9134158539634909, "grad_norm": 0.876341700553894, "learning_rate": 2e-05, "loss": 0.6532, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14611, "tokens_per_second_per_gpu": 10528.86, "total_tokens": 1442359056 }, { "epoch": 0.9134783695923981, "grad_norm": 0.896753191947937, "learning_rate": 2e-05, "loss": 0.6863, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14612, "tokens_per_second_per_gpu": 10794.07, "total_tokens": 1442459066 }, { "epoch": 0.9135408852213053, "grad_norm": 0.9002343416213989, "learning_rate": 2e-05, "loss": 0.5992, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14613, "tokens_per_second_per_gpu": 10682.31, "total_tokens": 1442557459 }, { "epoch": 0.9136034008502125, "grad_norm": 0.908191442489624, "learning_rate": 2e-05, "loss": 0.6398, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14614, "tokens_per_second_per_gpu": 11407.56, "total_tokens": 1442661993 }, { "epoch": 0.9136659164791198, "grad_norm": 0.8920986652374268, "learning_rate": 2e-05, "loss": 0.6151, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14615, "tokens_per_second_per_gpu": 10631.96, "total_tokens": 1442758625 }, { "epoch": 0.913728432108027, "grad_norm": 0.9425325393676758, "learning_rate": 2e-05, "loss": 0.5984, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14616, "tokens_per_second_per_gpu": 10517.65, "total_tokens": 1442860888 }, { "epoch": 0.9137909477369343, "grad_norm": 0.8679282665252686, "learning_rate": 2e-05, "loss": 0.5643, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14617, "tokens_per_second_per_gpu": 9805.49, "total_tokens": 1442957116 }, { "epoch": 0.9138534633658415, "grad_norm": 0.9061049222946167, "learning_rate": 2e-05, "loss": 0.5876, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14618, "tokens_per_second_per_gpu": 9786.93, "total_tokens": 1443053056 }, { "epoch": 0.9139159789947486, "grad_norm": 0.8696503043174744, "learning_rate": 2e-05, "loss": 0.6019, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14619, "tokens_per_second_per_gpu": 10835.76, "total_tokens": 1443155376 }, { "epoch": 0.9139784946236559, "grad_norm": 0.9169064164161682, "learning_rate": 2e-05, "loss": 0.6065, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14620, "tokens_per_second_per_gpu": 9757.8, "total_tokens": 1443249284 }, { "epoch": 0.9140410102525631, "grad_norm": 0.9245966672897339, "learning_rate": 2e-05, "loss": 0.6155, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14621, "tokens_per_second_per_gpu": 9983.61, "total_tokens": 1443345983 }, { "epoch": 0.9141035258814704, "grad_norm": 0.8971836566925049, "learning_rate": 2e-05, "loss": 0.6021, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14622, "tokens_per_second_per_gpu": 10050.32, "total_tokens": 1443443312 }, { "epoch": 0.9141660415103776, "grad_norm": 0.9034493565559387, "learning_rate": 2e-05, "loss": 0.62, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14623, "tokens_per_second_per_gpu": 10784.65, "total_tokens": 1443543296 }, { "epoch": 0.9142285571392849, "grad_norm": 0.8608485460281372, "learning_rate": 2e-05, "loss": 0.6068, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14624, "tokens_per_second_per_gpu": 10654.89, "total_tokens": 1443644677 }, { "epoch": 0.9142910727681921, "grad_norm": 0.8939640522003174, "learning_rate": 2e-05, "loss": 0.6263, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14625, "tokens_per_second_per_gpu": 10689.51, "total_tokens": 1443747509 }, { "epoch": 0.9143535883970992, "grad_norm": 0.845161497592926, "learning_rate": 2e-05, "loss": 0.5734, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14626, "tokens_per_second_per_gpu": 11136.18, "total_tokens": 1443850210 }, { "epoch": 0.9144161040260065, "grad_norm": 0.9096889495849609, "learning_rate": 2e-05, "loss": 0.6306, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14627, "tokens_per_second_per_gpu": 11563.2, "total_tokens": 1443954859 }, { "epoch": 0.9144786196549137, "grad_norm": 0.8797011971473694, "learning_rate": 2e-05, "loss": 0.5748, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14628, "tokens_per_second_per_gpu": 10803.21, "total_tokens": 1444053921 }, { "epoch": 0.914541135283821, "grad_norm": 0.8531519770622253, "learning_rate": 2e-05, "loss": 0.6368, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14629, "tokens_per_second_per_gpu": 11178.89, "total_tokens": 1444157516 }, { "epoch": 0.9146036509127282, "grad_norm": 0.8937023282051086, "learning_rate": 2e-05, "loss": 0.5681, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14630, "tokens_per_second_per_gpu": 10172.06, "total_tokens": 1444252742 }, { "epoch": 0.9146661665416355, "grad_norm": 0.9451360702514648, "learning_rate": 2e-05, "loss": 0.673, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14631, "tokens_per_second_per_gpu": 10232.4, "total_tokens": 1444349191 }, { "epoch": 0.9147286821705426, "grad_norm": 0.8745190501213074, "learning_rate": 2e-05, "loss": 0.6183, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14632, "tokens_per_second_per_gpu": 10387.32, "total_tokens": 1444449822 }, { "epoch": 0.9147911977994498, "grad_norm": 0.9029650688171387, "learning_rate": 2e-05, "loss": 0.6264, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14633, "tokens_per_second_per_gpu": 10446.89, "total_tokens": 1444547908 }, { "epoch": 0.9148537134283571, "grad_norm": 0.8739587664604187, "learning_rate": 2e-05, "loss": 0.5572, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14634, "tokens_per_second_per_gpu": 11395.16, "total_tokens": 1444644253 }, { "epoch": 0.9149162290572643, "grad_norm": 0.9027525186538696, "learning_rate": 2e-05, "loss": 0.6291, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14635, "tokens_per_second_per_gpu": 10335.79, "total_tokens": 1444741157 }, { "epoch": 0.9149787446861716, "grad_norm": 0.8705416917800903, "learning_rate": 2e-05, "loss": 0.6059, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14636, "tokens_per_second_per_gpu": 10472.8, "total_tokens": 1444843564 }, { "epoch": 0.9150412603150788, "grad_norm": 0.886750340461731, "learning_rate": 2e-05, "loss": 0.6118, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14637, "tokens_per_second_per_gpu": 10837.81, "total_tokens": 1444944220 }, { "epoch": 0.915103775943986, "grad_norm": 0.887239396572113, "learning_rate": 2e-05, "loss": 0.609, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14638, "tokens_per_second_per_gpu": 11226.09, "total_tokens": 1445047664 }, { "epoch": 0.9151662915728932, "grad_norm": 0.912031352519989, "learning_rate": 2e-05, "loss": 0.6142, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14639, "tokens_per_second_per_gpu": 10530.72, "total_tokens": 1445147354 }, { "epoch": 0.9152288072018004, "grad_norm": 0.8524512648582458, "learning_rate": 2e-05, "loss": 0.576, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14640, "tokens_per_second_per_gpu": 10410.02, "total_tokens": 1445243609 }, { "epoch": 0.9152913228307077, "grad_norm": 0.8880380392074585, "learning_rate": 2e-05, "loss": 0.6786, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14641, "tokens_per_second_per_gpu": 10660.36, "total_tokens": 1445345320 }, { "epoch": 0.9153538384596149, "grad_norm": 0.881358802318573, "learning_rate": 2e-05, "loss": 0.5916, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14642, "tokens_per_second_per_gpu": 9911.49, "total_tokens": 1445441680 }, { "epoch": 0.9154163540885222, "grad_norm": 0.8942898511886597, "learning_rate": 2e-05, "loss": 0.6105, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14643, "tokens_per_second_per_gpu": 10152.87, "total_tokens": 1445536885 }, { "epoch": 0.9154788697174293, "grad_norm": 0.920046865940094, "learning_rate": 2e-05, "loss": 0.5988, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14644, "tokens_per_second_per_gpu": 9761.59, "total_tokens": 1445632697 }, { "epoch": 0.9155413853463366, "grad_norm": 0.919398307800293, "learning_rate": 2e-05, "loss": 0.6705, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14645, "tokens_per_second_per_gpu": 10424.84, "total_tokens": 1445733475 }, { "epoch": 0.9156039009752438, "grad_norm": 0.8953824043273926, "learning_rate": 2e-05, "loss": 0.6469, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14646, "tokens_per_second_per_gpu": 10757.17, "total_tokens": 1445836860 }, { "epoch": 0.915666416604151, "grad_norm": 0.9129613041877747, "learning_rate": 2e-05, "loss": 0.607, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14647, "tokens_per_second_per_gpu": 10747.27, "total_tokens": 1445936106 }, { "epoch": 0.9157289322330583, "grad_norm": 0.9256171584129333, "learning_rate": 2e-05, "loss": 0.6115, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14648, "tokens_per_second_per_gpu": 10155.44, "total_tokens": 1446028556 }, { "epoch": 0.9157914478619655, "grad_norm": 0.952166736125946, "learning_rate": 2e-05, "loss": 0.6187, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14649, "tokens_per_second_per_gpu": 10592.21, "total_tokens": 1446124320 }, { "epoch": 0.9158539634908727, "grad_norm": 0.9760528802871704, "learning_rate": 2e-05, "loss": 0.5998, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14650, "tokens_per_second_per_gpu": 10679.57, "total_tokens": 1446225168 }, { "epoch": 0.9159164791197799, "grad_norm": 0.8889740705490112, "learning_rate": 2e-05, "loss": 0.6319, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14651, "tokens_per_second_per_gpu": 10476.46, "total_tokens": 1446324509 }, { "epoch": 0.9159789947486872, "grad_norm": 0.8733773231506348, "learning_rate": 2e-05, "loss": 0.5628, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14652, "tokens_per_second_per_gpu": 10483.17, "total_tokens": 1446421141 }, { "epoch": 0.9160415103775944, "grad_norm": 0.890484094619751, "learning_rate": 2e-05, "loss": 0.642, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14653, "tokens_per_second_per_gpu": 10366.37, "total_tokens": 1446520173 }, { "epoch": 0.9161040260065016, "grad_norm": 0.8963049650192261, "learning_rate": 2e-05, "loss": 0.6196, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14654, "tokens_per_second_per_gpu": 10601.04, "total_tokens": 1446622067 }, { "epoch": 0.9161665416354089, "grad_norm": 0.9590346813201904, "learning_rate": 2e-05, "loss": 0.6021, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14655, "tokens_per_second_per_gpu": 10177.92, "total_tokens": 1446717462 }, { "epoch": 0.916229057264316, "grad_norm": 0.9200029373168945, "learning_rate": 2e-05, "loss": 0.632, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14656, "tokens_per_second_per_gpu": 10886.42, "total_tokens": 1446819156 }, { "epoch": 0.9162915728932233, "grad_norm": 0.888097882270813, "learning_rate": 2e-05, "loss": 0.6147, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14657, "tokens_per_second_per_gpu": 10949.76, "total_tokens": 1446918550 }, { "epoch": 0.9163540885221305, "grad_norm": 0.8911339640617371, "learning_rate": 2e-05, "loss": 0.611, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14658, "tokens_per_second_per_gpu": 11239.3, "total_tokens": 1447019146 }, { "epoch": 0.9164166041510378, "grad_norm": 0.919937014579773, "learning_rate": 2e-05, "loss": 0.616, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14659, "tokens_per_second_per_gpu": 10399.7, "total_tokens": 1447118681 }, { "epoch": 0.916479119779945, "grad_norm": 0.9061667323112488, "learning_rate": 2e-05, "loss": 0.6208, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14660, "tokens_per_second_per_gpu": 10185.36, "total_tokens": 1447219284 }, { "epoch": 0.9165416354088523, "grad_norm": 0.8910291790962219, "learning_rate": 2e-05, "loss": 0.5918, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14661, "tokens_per_second_per_gpu": 10076.43, "total_tokens": 1447315594 }, { "epoch": 0.9166041510377594, "grad_norm": 0.9092119336128235, "learning_rate": 2e-05, "loss": 0.619, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14662, "tokens_per_second_per_gpu": 10886.12, "total_tokens": 1447414327 }, { "epoch": 0.9166666666666666, "grad_norm": 0.9123079180717468, "learning_rate": 2e-05, "loss": 0.5941, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14663, "tokens_per_second_per_gpu": 9872.94, "total_tokens": 1447512048 }, { "epoch": 0.9167291822955739, "grad_norm": 0.8833547234535217, "learning_rate": 2e-05, "loss": 0.5799, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14664, "tokens_per_second_per_gpu": 10360.46, "total_tokens": 1447609947 }, { "epoch": 0.9167916979244811, "grad_norm": 0.9377753138542175, "learning_rate": 2e-05, "loss": 0.627, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14665, "tokens_per_second_per_gpu": 10683.74, "total_tokens": 1447707884 }, { "epoch": 0.9168542135533884, "grad_norm": 0.9104425311088562, "learning_rate": 2e-05, "loss": 0.5925, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14666, "tokens_per_second_per_gpu": 9864.71, "total_tokens": 1447803202 }, { "epoch": 0.9169167291822956, "grad_norm": 0.9124295115470886, "learning_rate": 2e-05, "loss": 0.6213, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14667, "tokens_per_second_per_gpu": 10699.28, "total_tokens": 1447902768 }, { "epoch": 0.9169792448112029, "grad_norm": 0.9121386408805847, "learning_rate": 2e-05, "loss": 0.6005, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14668, "tokens_per_second_per_gpu": 10275.48, "total_tokens": 1447999742 }, { "epoch": 0.91704176044011, "grad_norm": 0.857532262802124, "learning_rate": 2e-05, "loss": 0.5963, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14669, "tokens_per_second_per_gpu": 10401.59, "total_tokens": 1448101605 }, { "epoch": 0.9171042760690172, "grad_norm": 0.928098738193512, "learning_rate": 2e-05, "loss": 0.6437, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14670, "tokens_per_second_per_gpu": 10409.82, "total_tokens": 1448201547 }, { "epoch": 0.9171667916979245, "grad_norm": 0.8832036852836609, "learning_rate": 2e-05, "loss": 0.6305, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14671, "tokens_per_second_per_gpu": 10137.06, "total_tokens": 1448299579 }, { "epoch": 0.9172293073268317, "grad_norm": 0.8668675422668457, "learning_rate": 2e-05, "loss": 0.6133, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14672, "tokens_per_second_per_gpu": 10539.73, "total_tokens": 1448399005 }, { "epoch": 0.917291822955739, "grad_norm": 0.9075150489807129, "learning_rate": 2e-05, "loss": 0.5925, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14673, "tokens_per_second_per_gpu": 10256.47, "total_tokens": 1448496739 }, { "epoch": 0.9173543385846462, "grad_norm": 0.9403502345085144, "learning_rate": 2e-05, "loss": 0.6141, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14674, "tokens_per_second_per_gpu": 10278.48, "total_tokens": 1448595516 }, { "epoch": 0.9174168542135533, "grad_norm": 0.8953685760498047, "learning_rate": 2e-05, "loss": 0.633, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14675, "tokens_per_second_per_gpu": 10897.27, "total_tokens": 1448698273 }, { "epoch": 0.9174793698424606, "grad_norm": 0.9076838493347168, "learning_rate": 2e-05, "loss": 0.6586, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14676, "tokens_per_second_per_gpu": 10954.41, "total_tokens": 1448799272 }, { "epoch": 0.9175418854713678, "grad_norm": 0.8806486129760742, "learning_rate": 2e-05, "loss": 0.5779, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14677, "tokens_per_second_per_gpu": 10129.42, "total_tokens": 1448895217 }, { "epoch": 0.9176044011002751, "grad_norm": 0.8838737607002258, "learning_rate": 2e-05, "loss": 0.5956, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14678, "tokens_per_second_per_gpu": 9817.19, "total_tokens": 1448991771 }, { "epoch": 0.9176669167291823, "grad_norm": 0.9020230770111084, "learning_rate": 2e-05, "loss": 0.6212, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14679, "tokens_per_second_per_gpu": 10037.89, "total_tokens": 1449090311 }, { "epoch": 0.9177294323580896, "grad_norm": 0.8918625712394714, "learning_rate": 2e-05, "loss": 0.6186, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14680, "tokens_per_second_per_gpu": 10518.44, "total_tokens": 1449188634 }, { "epoch": 0.9177919479869967, "grad_norm": 0.9344570636749268, "learning_rate": 2e-05, "loss": 0.5921, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14681, "tokens_per_second_per_gpu": 10432.77, "total_tokens": 1449285625 }, { "epoch": 0.917854463615904, "grad_norm": 0.900397777557373, "learning_rate": 2e-05, "loss": 0.655, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14682, "tokens_per_second_per_gpu": 10817.61, "total_tokens": 1449386676 }, { "epoch": 0.9179169792448112, "grad_norm": 0.8744664192199707, "learning_rate": 2e-05, "loss": 0.5981, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14683, "tokens_per_second_per_gpu": 10850.44, "total_tokens": 1449483824 }, { "epoch": 0.9179794948737184, "grad_norm": 0.8613439798355103, "learning_rate": 2e-05, "loss": 0.5742, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14684, "tokens_per_second_per_gpu": 10489.27, "total_tokens": 1449584543 }, { "epoch": 0.9180420105026257, "grad_norm": 0.9208601117134094, "learning_rate": 2e-05, "loss": 0.6103, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14685, "tokens_per_second_per_gpu": 10933.88, "total_tokens": 1449686445 }, { "epoch": 0.9181045261315329, "grad_norm": 0.9129777550697327, "learning_rate": 2e-05, "loss": 0.5859, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14686, "tokens_per_second_per_gpu": 10099.55, "total_tokens": 1449783252 }, { "epoch": 0.9181670417604401, "grad_norm": 0.9156306982040405, "learning_rate": 2e-05, "loss": 0.6019, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14687, "tokens_per_second_per_gpu": 10997.75, "total_tokens": 1449881930 }, { "epoch": 0.9182295573893473, "grad_norm": 0.9378200769424438, "learning_rate": 2e-05, "loss": 0.6091, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14688, "tokens_per_second_per_gpu": 11241.92, "total_tokens": 1449978770 }, { "epoch": 0.9182920730182546, "grad_norm": 0.911342203617096, "learning_rate": 2e-05, "loss": 0.6774, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14689, "tokens_per_second_per_gpu": 11973.71, "total_tokens": 1450078988 }, { "epoch": 0.9183545886471618, "grad_norm": 0.8999727368354797, "learning_rate": 2e-05, "loss": 0.6154, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14690, "tokens_per_second_per_gpu": 10477.45, "total_tokens": 1450178430 }, { "epoch": 0.918417104276069, "grad_norm": 0.8682667016983032, "learning_rate": 2e-05, "loss": 0.6042, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14691, "tokens_per_second_per_gpu": 11333.63, "total_tokens": 1450285303 }, { "epoch": 0.9184796199049763, "grad_norm": 0.9165598154067993, "learning_rate": 2e-05, "loss": 0.6458, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14692, "tokens_per_second_per_gpu": 10079.91, "total_tokens": 1450383153 }, { "epoch": 0.9185421355338834, "grad_norm": 0.8953408598899841, "learning_rate": 2e-05, "loss": 0.6354, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14693, "tokens_per_second_per_gpu": 10159.36, "total_tokens": 1450477479 }, { "epoch": 0.9186046511627907, "grad_norm": 0.9312976002693176, "learning_rate": 2e-05, "loss": 0.5797, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14694, "tokens_per_second_per_gpu": 11272.2, "total_tokens": 1450577204 }, { "epoch": 0.9186671667916979, "grad_norm": 0.8919880390167236, "learning_rate": 2e-05, "loss": 0.5988, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14695, "tokens_per_second_per_gpu": 10409.17, "total_tokens": 1450674135 }, { "epoch": 0.9187296824206052, "grad_norm": 0.9087428450584412, "learning_rate": 2e-05, "loss": 0.6695, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14696, "tokens_per_second_per_gpu": 10890.77, "total_tokens": 1450774470 }, { "epoch": 0.9187921980495124, "grad_norm": 1.2426528930664062, "learning_rate": 2e-05, "loss": 0.6404, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14697, "tokens_per_second_per_gpu": 10150.94, "total_tokens": 1450873478 }, { "epoch": 0.9188547136784196, "grad_norm": 0.8763628005981445, "learning_rate": 2e-05, "loss": 0.6276, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14698, "tokens_per_second_per_gpu": 11098.35, "total_tokens": 1450973190 }, { "epoch": 0.9189172293073268, "grad_norm": 0.9004682898521423, "learning_rate": 2e-05, "loss": 0.5765, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14699, "tokens_per_second_per_gpu": 9886.61, "total_tokens": 1451067815 }, { "epoch": 0.918979744936234, "grad_norm": 1.0014170408248901, "learning_rate": 2e-05, "loss": 0.6314, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14700, "tokens_per_second_per_gpu": 10092.8, "total_tokens": 1451167498 }, { "epoch": 0.9190422605651413, "grad_norm": 0.8802646994590759, "learning_rate": 2e-05, "loss": 0.6352, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14701, "tokens_per_second_per_gpu": 10969.66, "total_tokens": 1451268412 }, { "epoch": 0.9191047761940485, "grad_norm": 0.8546357750892639, "learning_rate": 2e-05, "loss": 0.6023, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14702, "tokens_per_second_per_gpu": 11222.31, "total_tokens": 1451368705 }, { "epoch": 0.9191672918229558, "grad_norm": 0.8856434226036072, "learning_rate": 2e-05, "loss": 0.6347, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14703, "tokens_per_second_per_gpu": 10171.57, "total_tokens": 1451466507 }, { "epoch": 0.919229807451863, "grad_norm": 0.918898344039917, "learning_rate": 2e-05, "loss": 0.6071, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14704, "tokens_per_second_per_gpu": 10576.73, "total_tokens": 1451568409 }, { "epoch": 0.9192923230807702, "grad_norm": 0.9225409030914307, "learning_rate": 2e-05, "loss": 0.6047, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14705, "tokens_per_second_per_gpu": 10568.36, "total_tokens": 1451669507 }, { "epoch": 0.9193548387096774, "grad_norm": 0.953831672668457, "learning_rate": 2e-05, "loss": 0.645, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14706, "tokens_per_second_per_gpu": 10468.28, "total_tokens": 1451768692 }, { "epoch": 0.9194173543385846, "grad_norm": 0.901591420173645, "learning_rate": 2e-05, "loss": 0.622, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14707, "tokens_per_second_per_gpu": 10613.84, "total_tokens": 1451864601 }, { "epoch": 0.9194798699674919, "grad_norm": 0.867977499961853, "learning_rate": 2e-05, "loss": 0.6093, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14708, "tokens_per_second_per_gpu": 10508.42, "total_tokens": 1451963946 }, { "epoch": 0.9195423855963991, "grad_norm": 0.924267590045929, "learning_rate": 2e-05, "loss": 0.5793, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14709, "tokens_per_second_per_gpu": 10054.65, "total_tokens": 1452057991 }, { "epoch": 0.9196049012253064, "grad_norm": 0.8998110294342041, "learning_rate": 2e-05, "loss": 0.5994, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14710, "tokens_per_second_per_gpu": 10743.94, "total_tokens": 1452155776 }, { "epoch": 0.9196674168542136, "grad_norm": 0.9902384281158447, "learning_rate": 2e-05, "loss": 0.5982, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14711, "tokens_per_second_per_gpu": 10187.55, "total_tokens": 1452253173 }, { "epoch": 0.9197299324831207, "grad_norm": 0.8818157911300659, "learning_rate": 2e-05, "loss": 0.6043, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14712, "tokens_per_second_per_gpu": 10869.28, "total_tokens": 1452354336 }, { "epoch": 0.919792448112028, "grad_norm": 0.9008480310440063, "learning_rate": 2e-05, "loss": 0.6612, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14713, "tokens_per_second_per_gpu": 10980.23, "total_tokens": 1452459469 }, { "epoch": 0.9198549637409352, "grad_norm": 0.9010132551193237, "learning_rate": 2e-05, "loss": 0.6334, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14714, "tokens_per_second_per_gpu": 10947.79, "total_tokens": 1452561843 }, { "epoch": 0.9199174793698425, "grad_norm": 0.847470223903656, "learning_rate": 2e-05, "loss": 0.608, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14715, "tokens_per_second_per_gpu": 11071.96, "total_tokens": 1452663733 }, { "epoch": 0.9199799949987497, "grad_norm": 0.9175447821617126, "learning_rate": 2e-05, "loss": 0.6631, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14716, "tokens_per_second_per_gpu": 10183.93, "total_tokens": 1452764963 }, { "epoch": 0.920042510627657, "grad_norm": 0.905568540096283, "learning_rate": 2e-05, "loss": 0.6301, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14717, "tokens_per_second_per_gpu": 11217.09, "total_tokens": 1452868187 }, { "epoch": 0.9201050262565641, "grad_norm": 0.8732720017433167, "learning_rate": 2e-05, "loss": 0.6289, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14718, "tokens_per_second_per_gpu": 11011.29, "total_tokens": 1452967313 }, { "epoch": 0.9201675418854713, "grad_norm": 0.9055741429328918, "learning_rate": 2e-05, "loss": 0.6531, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14719, "tokens_per_second_per_gpu": 11130.89, "total_tokens": 1453067927 }, { "epoch": 0.9202300575143786, "grad_norm": 0.9599918127059937, "learning_rate": 2e-05, "loss": 0.6303, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14720, "tokens_per_second_per_gpu": 10654.39, "total_tokens": 1453167301 }, { "epoch": 0.9202925731432858, "grad_norm": 0.9206023216247559, "learning_rate": 2e-05, "loss": 0.6288, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14721, "tokens_per_second_per_gpu": 11339.22, "total_tokens": 1453267788 }, { "epoch": 0.9203550887721931, "grad_norm": 0.8932795524597168, "learning_rate": 2e-05, "loss": 0.6223, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14722, "tokens_per_second_per_gpu": 10200.42, "total_tokens": 1453367466 }, { "epoch": 0.9204176044011003, "grad_norm": 0.8963953852653503, "learning_rate": 2e-05, "loss": 0.6041, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14723, "tokens_per_second_per_gpu": 11083.16, "total_tokens": 1453467771 }, { "epoch": 0.9204801200300075, "grad_norm": 0.912639319896698, "learning_rate": 2e-05, "loss": 0.61, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14724, "tokens_per_second_per_gpu": 10344.79, "total_tokens": 1453567015 }, { "epoch": 0.9205426356589147, "grad_norm": 0.9280818104743958, "learning_rate": 2e-05, "loss": 0.6491, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14725, "tokens_per_second_per_gpu": 10660.53, "total_tokens": 1453668686 }, { "epoch": 0.9206051512878219, "grad_norm": 0.9315626621246338, "learning_rate": 2e-05, "loss": 0.6499, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14726, "tokens_per_second_per_gpu": 11011.17, "total_tokens": 1453768811 }, { "epoch": 0.9206676669167292, "grad_norm": 1.103875756263733, "learning_rate": 2e-05, "loss": 0.628, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14727, "tokens_per_second_per_gpu": 10723.18, "total_tokens": 1453867177 }, { "epoch": 0.9207301825456364, "grad_norm": 0.8608686327934265, "learning_rate": 2e-05, "loss": 0.6409, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14728, "tokens_per_second_per_gpu": 10616.23, "total_tokens": 1453968374 }, { "epoch": 0.9207926981745437, "grad_norm": 0.9230279326438904, "learning_rate": 2e-05, "loss": 0.6312, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14729, "tokens_per_second_per_gpu": 10570.06, "total_tokens": 1454067467 }, { "epoch": 0.9208552138034508, "grad_norm": 0.8566730618476868, "learning_rate": 2e-05, "loss": 0.5898, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14730, "tokens_per_second_per_gpu": 10563.82, "total_tokens": 1454168680 }, { "epoch": 0.9209177294323581, "grad_norm": 0.8638827800750732, "learning_rate": 2e-05, "loss": 0.5855, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14731, "tokens_per_second_per_gpu": 11374.24, "total_tokens": 1454269087 }, { "epoch": 0.9209802450612653, "grad_norm": 0.8943178057670593, "learning_rate": 2e-05, "loss": 0.6392, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14732, "tokens_per_second_per_gpu": 10627.36, "total_tokens": 1454369382 }, { "epoch": 0.9210427606901725, "grad_norm": 1.0130674839019775, "learning_rate": 2e-05, "loss": 0.6277, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14733, "tokens_per_second_per_gpu": 9479.13, "total_tokens": 1454466325 }, { "epoch": 0.9211052763190798, "grad_norm": 0.9060347676277161, "learning_rate": 2e-05, "loss": 0.6379, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14734, "tokens_per_second_per_gpu": 9957.63, "total_tokens": 1454565688 }, { "epoch": 0.921167791947987, "grad_norm": 0.9178869724273682, "learning_rate": 2e-05, "loss": 0.6204, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14735, "tokens_per_second_per_gpu": 10921.48, "total_tokens": 1454662053 }, { "epoch": 0.9212303075768942, "grad_norm": 0.9054402112960815, "learning_rate": 2e-05, "loss": 0.6153, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14736, "tokens_per_second_per_gpu": 11498.04, "total_tokens": 1454761970 }, { "epoch": 0.9212928232058014, "grad_norm": 0.8757187128067017, "learning_rate": 2e-05, "loss": 0.6037, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14737, "tokens_per_second_per_gpu": 11368.84, "total_tokens": 1454861191 }, { "epoch": 0.9213553388347087, "grad_norm": 0.8868576884269714, "learning_rate": 2e-05, "loss": 0.5783, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14738, "tokens_per_second_per_gpu": 10688.68, "total_tokens": 1454962627 }, { "epoch": 0.9214178544636159, "grad_norm": 0.8936610817909241, "learning_rate": 2e-05, "loss": 0.6391, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14739, "tokens_per_second_per_gpu": 11587.9, "total_tokens": 1455068682 }, { "epoch": 0.9214803700925231, "grad_norm": 0.8532912731170654, "learning_rate": 2e-05, "loss": 0.6335, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14740, "tokens_per_second_per_gpu": 10921.75, "total_tokens": 1455172560 }, { "epoch": 0.9215428857214304, "grad_norm": 0.899604856967926, "learning_rate": 2e-05, "loss": 0.6338, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14741, "tokens_per_second_per_gpu": 10251.29, "total_tokens": 1455272532 }, { "epoch": 0.9216054013503376, "grad_norm": 0.9302284717559814, "learning_rate": 2e-05, "loss": 0.5771, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14742, "tokens_per_second_per_gpu": 10089.64, "total_tokens": 1455367919 }, { "epoch": 0.9216679169792448, "grad_norm": 0.9021249413490295, "learning_rate": 2e-05, "loss": 0.597, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14743, "tokens_per_second_per_gpu": 9585.88, "total_tokens": 1455463588 }, { "epoch": 0.921730432608152, "grad_norm": 0.887612521648407, "learning_rate": 2e-05, "loss": 0.6099, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14744, "tokens_per_second_per_gpu": 10583.56, "total_tokens": 1455561566 }, { "epoch": 0.9217929482370593, "grad_norm": 0.9191667437553406, "learning_rate": 2e-05, "loss": 0.6643, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14745, "tokens_per_second_per_gpu": 11219.92, "total_tokens": 1455662706 }, { "epoch": 0.9218554638659665, "grad_norm": 0.9264563918113708, "learning_rate": 2e-05, "loss": 0.6263, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14746, "tokens_per_second_per_gpu": 10963.29, "total_tokens": 1455762569 }, { "epoch": 0.9219179794948738, "grad_norm": 0.9130547642707825, "learning_rate": 2e-05, "loss": 0.6658, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14747, "tokens_per_second_per_gpu": 11187.78, "total_tokens": 1455866368 }, { "epoch": 0.921980495123781, "grad_norm": 0.8876446485519409, "learning_rate": 2e-05, "loss": 0.6592, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14748, "tokens_per_second_per_gpu": 10238.13, "total_tokens": 1455966959 }, { "epoch": 0.9220430107526881, "grad_norm": 0.89349764585495, "learning_rate": 2e-05, "loss": 0.6856, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14749, "tokens_per_second_per_gpu": 10895.85, "total_tokens": 1456068754 }, { "epoch": 0.9221055263815954, "grad_norm": 0.8939225077629089, "learning_rate": 2e-05, "loss": 0.6012, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14750, "tokens_per_second_per_gpu": 13440.44, "total_tokens": 1456168212 }, { "epoch": 0.9221680420105026, "grad_norm": 0.9011453986167908, "learning_rate": 2e-05, "loss": 0.6345, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14751, "tokens_per_second_per_gpu": 11174.01, "total_tokens": 1456266954 }, { "epoch": 0.9222305576394099, "grad_norm": 0.9281036257743835, "learning_rate": 2e-05, "loss": 0.6448, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14752, "tokens_per_second_per_gpu": 10521.26, "total_tokens": 1456366179 }, { "epoch": 0.9222930732683171, "grad_norm": 0.8902789950370789, "learning_rate": 2e-05, "loss": 0.581, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14753, "tokens_per_second_per_gpu": 10468.96, "total_tokens": 1456460831 }, { "epoch": 0.9223555888972244, "grad_norm": 0.8730211853981018, "learning_rate": 2e-05, "loss": 0.5689, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14754, "tokens_per_second_per_gpu": 10209.53, "total_tokens": 1456557239 }, { "epoch": 0.9224181045261315, "grad_norm": 1.0640956163406372, "learning_rate": 2e-05, "loss": 0.6114, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14755, "tokens_per_second_per_gpu": 10142.06, "total_tokens": 1456652517 }, { "epoch": 0.9224806201550387, "grad_norm": 0.9521348476409912, "learning_rate": 2e-05, "loss": 0.6161, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14756, "tokens_per_second_per_gpu": 10405.14, "total_tokens": 1456751131 }, { "epoch": 0.922543135783946, "grad_norm": 0.9193359613418579, "learning_rate": 2e-05, "loss": 0.6238, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14757, "tokens_per_second_per_gpu": 10551.77, "total_tokens": 1456851980 }, { "epoch": 0.9226056514128532, "grad_norm": 0.9629909992218018, "learning_rate": 2e-05, "loss": 0.5855, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14758, "tokens_per_second_per_gpu": 8967.94, "total_tokens": 1456945422 }, { "epoch": 0.9226681670417605, "grad_norm": 0.9028028249740601, "learning_rate": 2e-05, "loss": 0.6321, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14759, "tokens_per_second_per_gpu": 11139.87, "total_tokens": 1457047756 }, { "epoch": 0.9227306826706677, "grad_norm": 0.9223424792289734, "learning_rate": 2e-05, "loss": 0.6391, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14760, "tokens_per_second_per_gpu": 10392.52, "total_tokens": 1457149789 }, { "epoch": 0.9227931982995748, "grad_norm": 0.904004693031311, "learning_rate": 2e-05, "loss": 0.6148, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14761, "tokens_per_second_per_gpu": 10220.83, "total_tokens": 1457247862 }, { "epoch": 0.9228557139284821, "grad_norm": 0.9388886094093323, "learning_rate": 2e-05, "loss": 0.6384, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14762, "tokens_per_second_per_gpu": 10501.54, "total_tokens": 1457347752 }, { "epoch": 0.9229182295573893, "grad_norm": 0.8838473558425903, "learning_rate": 2e-05, "loss": 0.6237, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14763, "tokens_per_second_per_gpu": 11150.7, "total_tokens": 1457447499 }, { "epoch": 0.9229807451862966, "grad_norm": 0.9391043186187744, "learning_rate": 2e-05, "loss": 0.5915, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14764, "tokens_per_second_per_gpu": 10531.83, "total_tokens": 1457546104 }, { "epoch": 0.9230432608152038, "grad_norm": 0.9262152314186096, "learning_rate": 2e-05, "loss": 0.6356, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14765, "tokens_per_second_per_gpu": 10652.26, "total_tokens": 1457645731 }, { "epoch": 0.9231057764441111, "grad_norm": 0.8832862377166748, "learning_rate": 2e-05, "loss": 0.5808, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14766, "tokens_per_second_per_gpu": 9754.33, "total_tokens": 1457742007 }, { "epoch": 0.9231682920730182, "grad_norm": 0.8750807642936707, "learning_rate": 2e-05, "loss": 0.6202, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14767, "tokens_per_second_per_gpu": 10725.75, "total_tokens": 1457839418 }, { "epoch": 0.9232308077019254, "grad_norm": 0.9000819325447083, "learning_rate": 2e-05, "loss": 0.5786, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14768, "tokens_per_second_per_gpu": 8680.78, "total_tokens": 1457930343 }, { "epoch": 0.9232933233308327, "grad_norm": 0.8904009461402893, "learning_rate": 2e-05, "loss": 0.6573, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14769, "tokens_per_second_per_gpu": 10634.33, "total_tokens": 1458028785 }, { "epoch": 0.9233558389597399, "grad_norm": 0.8901941180229187, "learning_rate": 2e-05, "loss": 0.6084, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14770, "tokens_per_second_per_gpu": 10805.03, "total_tokens": 1458126133 }, { "epoch": 0.9234183545886472, "grad_norm": 0.8979316353797913, "learning_rate": 2e-05, "loss": 0.6171, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14771, "tokens_per_second_per_gpu": 11086.54, "total_tokens": 1458225660 }, { "epoch": 0.9234808702175544, "grad_norm": 0.922035813331604, "learning_rate": 2e-05, "loss": 0.6272, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14772, "tokens_per_second_per_gpu": 9955.21, "total_tokens": 1458320808 }, { "epoch": 0.9235433858464616, "grad_norm": 0.9107963442802429, "learning_rate": 2e-05, "loss": 0.6173, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14773, "tokens_per_second_per_gpu": 10970.38, "total_tokens": 1458421043 }, { "epoch": 0.9236059014753688, "grad_norm": 0.8804155588150024, "learning_rate": 2e-05, "loss": 0.6069, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14774, "tokens_per_second_per_gpu": 10711.53, "total_tokens": 1458524181 }, { "epoch": 0.923668417104276, "grad_norm": 0.9253988265991211, "learning_rate": 2e-05, "loss": 0.6521, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14775, "tokens_per_second_per_gpu": 10930.83, "total_tokens": 1458622301 }, { "epoch": 0.9237309327331833, "grad_norm": 0.9243582487106323, "learning_rate": 2e-05, "loss": 0.6269, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14776, "tokens_per_second_per_gpu": 10611.45, "total_tokens": 1458719484 }, { "epoch": 0.9237934483620905, "grad_norm": 0.9199414849281311, "learning_rate": 2e-05, "loss": 0.6043, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14777, "tokens_per_second_per_gpu": 10181.85, "total_tokens": 1458817825 }, { "epoch": 0.9238559639909978, "grad_norm": 0.8802140355110168, "learning_rate": 2e-05, "loss": 0.5575, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14778, "tokens_per_second_per_gpu": 10076.64, "total_tokens": 1458912281 }, { "epoch": 0.923918479619905, "grad_norm": 0.8874558210372925, "learning_rate": 2e-05, "loss": 0.6056, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14779, "tokens_per_second_per_gpu": 9941.92, "total_tokens": 1459007222 }, { "epoch": 0.9239809952488122, "grad_norm": 0.866880476474762, "learning_rate": 2e-05, "loss": 0.6245, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14780, "tokens_per_second_per_gpu": 11516.68, "total_tokens": 1459111966 }, { "epoch": 0.9240435108777194, "grad_norm": 0.8997888565063477, "learning_rate": 2e-05, "loss": 0.6136, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14781, "tokens_per_second_per_gpu": 10821.57, "total_tokens": 1459207130 }, { "epoch": 0.9241060265066267, "grad_norm": 0.8587899208068848, "learning_rate": 2e-05, "loss": 0.5855, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14782, "tokens_per_second_per_gpu": 10654.91, "total_tokens": 1459309295 }, { "epoch": 0.9241685421355339, "grad_norm": 0.8992146253585815, "learning_rate": 2e-05, "loss": 0.641, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14783, "tokens_per_second_per_gpu": 10543.17, "total_tokens": 1459411531 }, { "epoch": 0.9242310577644411, "grad_norm": 0.9364867806434631, "learning_rate": 2e-05, "loss": 0.5846, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14784, "tokens_per_second_per_gpu": 9847.03, "total_tokens": 1459499429 }, { "epoch": 0.9242935733933484, "grad_norm": 0.9030250906944275, "learning_rate": 2e-05, "loss": 0.6101, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14785, "tokens_per_second_per_gpu": 10005.48, "total_tokens": 1459595257 }, { "epoch": 0.9243560890222555, "grad_norm": 0.8924476504325867, "learning_rate": 2e-05, "loss": 0.5943, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14786, "tokens_per_second_per_gpu": 9963.36, "total_tokens": 1459692370 }, { "epoch": 0.9244186046511628, "grad_norm": 0.8653920888900757, "learning_rate": 2e-05, "loss": 0.6127, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14787, "tokens_per_second_per_gpu": 10158.97, "total_tokens": 1459792113 }, { "epoch": 0.92448112028007, "grad_norm": 0.9098005294799805, "learning_rate": 2e-05, "loss": 0.6127, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14788, "tokens_per_second_per_gpu": 9771.06, "total_tokens": 1459889671 }, { "epoch": 0.9245436359089773, "grad_norm": 0.8788864016532898, "learning_rate": 2e-05, "loss": 0.6112, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14789, "tokens_per_second_per_gpu": 10313.21, "total_tokens": 1459988672 }, { "epoch": 0.9246061515378845, "grad_norm": 0.8855826258659363, "learning_rate": 2e-05, "loss": 0.6573, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14790, "tokens_per_second_per_gpu": 11438.07, "total_tokens": 1460091127 }, { "epoch": 0.9246686671667917, "grad_norm": 0.9382832050323486, "learning_rate": 2e-05, "loss": 0.6222, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14791, "tokens_per_second_per_gpu": 9920.9, "total_tokens": 1460184963 }, { "epoch": 0.9247311827956989, "grad_norm": 0.933074951171875, "learning_rate": 2e-05, "loss": 0.5914, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14792, "tokens_per_second_per_gpu": 10694.91, "total_tokens": 1460281638 }, { "epoch": 0.9247936984246061, "grad_norm": 0.8739075064659119, "learning_rate": 2e-05, "loss": 0.6104, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14793, "tokens_per_second_per_gpu": 10663.3, "total_tokens": 1460383131 }, { "epoch": 0.9248562140535134, "grad_norm": 0.9053231477737427, "learning_rate": 2e-05, "loss": 0.6022, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14794, "tokens_per_second_per_gpu": 11243.38, "total_tokens": 1460487309 }, { "epoch": 0.9249187296824206, "grad_norm": 0.9508957266807556, "learning_rate": 2e-05, "loss": 0.6326, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14795, "tokens_per_second_per_gpu": 10133.53, "total_tokens": 1460588849 }, { "epoch": 0.9249812453113279, "grad_norm": 0.9234967231750488, "learning_rate": 2e-05, "loss": 0.5787, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14796, "tokens_per_second_per_gpu": 8601.79, "total_tokens": 1460673205 }, { "epoch": 0.9250437609402351, "grad_norm": 0.8955398201942444, "learning_rate": 2e-05, "loss": 0.6417, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14797, "tokens_per_second_per_gpu": 11130.69, "total_tokens": 1460776537 }, { "epoch": 0.9251062765691422, "grad_norm": 0.9024887084960938, "learning_rate": 2e-05, "loss": 0.6606, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14798, "tokens_per_second_per_gpu": 11169.49, "total_tokens": 1460880362 }, { "epoch": 0.9251687921980495, "grad_norm": 0.8881276249885559, "learning_rate": 2e-05, "loss": 0.6763, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14799, "tokens_per_second_per_gpu": 11230.66, "total_tokens": 1460983830 }, { "epoch": 0.9252313078269567, "grad_norm": 0.9163209795951843, "learning_rate": 2e-05, "loss": 0.6078, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14800, "tokens_per_second_per_gpu": 10654.24, "total_tokens": 1461083602 }, { "epoch": 0.925293823455864, "grad_norm": 0.8943377137184143, "learning_rate": 2e-05, "loss": 0.642, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14801, "tokens_per_second_per_gpu": 10456.17, "total_tokens": 1461181471 }, { "epoch": 0.9253563390847712, "grad_norm": 0.8809317350387573, "learning_rate": 2e-05, "loss": 0.5757, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14802, "tokens_per_second_per_gpu": 9974.98, "total_tokens": 1461278910 }, { "epoch": 0.9254188547136785, "grad_norm": 0.9080274105072021, "learning_rate": 2e-05, "loss": 0.5871, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14803, "tokens_per_second_per_gpu": 10732.4, "total_tokens": 1461377652 }, { "epoch": 0.9254813703425856, "grad_norm": 0.8797262907028198, "learning_rate": 2e-05, "loss": 0.6021, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14804, "tokens_per_second_per_gpu": 10394.32, "total_tokens": 1461477026 }, { "epoch": 0.9255438859714928, "grad_norm": 0.8820655941963196, "learning_rate": 2e-05, "loss": 0.5872, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14805, "tokens_per_second_per_gpu": 10400.32, "total_tokens": 1461577498 }, { "epoch": 0.9256064016004001, "grad_norm": 0.864598274230957, "learning_rate": 2e-05, "loss": 0.6145, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14806, "tokens_per_second_per_gpu": 10763.83, "total_tokens": 1461680280 }, { "epoch": 0.9256689172293073, "grad_norm": 0.890894889831543, "learning_rate": 2e-05, "loss": 0.603, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14807, "tokens_per_second_per_gpu": 10431.31, "total_tokens": 1461778037 }, { "epoch": 0.9257314328582146, "grad_norm": 0.8830973505973816, "learning_rate": 2e-05, "loss": 0.5927, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14808, "tokens_per_second_per_gpu": 10067.3, "total_tokens": 1461876322 }, { "epoch": 0.9257939484871218, "grad_norm": 0.8976466059684753, "learning_rate": 2e-05, "loss": 0.6162, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14809, "tokens_per_second_per_gpu": 10849.55, "total_tokens": 1461977929 }, { "epoch": 0.925856464116029, "grad_norm": 0.9103195667266846, "learning_rate": 2e-05, "loss": 0.6193, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14810, "tokens_per_second_per_gpu": 10874.74, "total_tokens": 1462077793 }, { "epoch": 0.9259189797449362, "grad_norm": 0.8772385120391846, "learning_rate": 2e-05, "loss": 0.6079, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14811, "tokens_per_second_per_gpu": 10821.29, "total_tokens": 1462179577 }, { "epoch": 0.9259814953738434, "grad_norm": 0.9117776155471802, "learning_rate": 2e-05, "loss": 0.6203, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14812, "tokens_per_second_per_gpu": 10587.72, "total_tokens": 1462275950 }, { "epoch": 0.9260440110027507, "grad_norm": 0.8703238368034363, "learning_rate": 2e-05, "loss": 0.6114, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14813, "tokens_per_second_per_gpu": 11089.4, "total_tokens": 1462375259 }, { "epoch": 0.9261065266316579, "grad_norm": 0.8866369128227234, "learning_rate": 2e-05, "loss": 0.5853, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14814, "tokens_per_second_per_gpu": 9860.29, "total_tokens": 1462472691 }, { "epoch": 0.9261690422605652, "grad_norm": 0.9162746071815491, "learning_rate": 2e-05, "loss": 0.5921, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14815, "tokens_per_second_per_gpu": 10397.06, "total_tokens": 1462564562 }, { "epoch": 0.9262315578894724, "grad_norm": 0.9109216928482056, "learning_rate": 2e-05, "loss": 0.6523, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14816, "tokens_per_second_per_gpu": 10542.9, "total_tokens": 1462664435 }, { "epoch": 0.9262940735183796, "grad_norm": 0.9040337800979614, "learning_rate": 2e-05, "loss": 0.6645, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14817, "tokens_per_second_per_gpu": 11226.46, "total_tokens": 1462766882 }, { "epoch": 0.9263565891472868, "grad_norm": 0.8633551597595215, "learning_rate": 2e-05, "loss": 0.6187, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14818, "tokens_per_second_per_gpu": 11293.5, "total_tokens": 1462870196 }, { "epoch": 0.926419104776194, "grad_norm": 0.8930957317352295, "learning_rate": 2e-05, "loss": 0.6054, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14819, "tokens_per_second_per_gpu": 10710.4, "total_tokens": 1462971278 }, { "epoch": 0.9264816204051013, "grad_norm": 0.8885605931282043, "learning_rate": 2e-05, "loss": 0.6273, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14820, "tokens_per_second_per_gpu": 11095.73, "total_tokens": 1463073086 }, { "epoch": 0.9265441360340085, "grad_norm": 0.9010046124458313, "learning_rate": 2e-05, "loss": 0.6082, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14821, "tokens_per_second_per_gpu": 10954.05, "total_tokens": 1463175013 }, { "epoch": 0.9266066516629158, "grad_norm": 0.870802640914917, "learning_rate": 2e-05, "loss": 0.5902, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14822, "tokens_per_second_per_gpu": 11099.16, "total_tokens": 1463277044 }, { "epoch": 0.9266691672918229, "grad_norm": 0.8648938536643982, "learning_rate": 2e-05, "loss": 0.6129, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14823, "tokens_per_second_per_gpu": 10702.24, "total_tokens": 1463374718 }, { "epoch": 0.9267316829207302, "grad_norm": 0.8467553853988647, "learning_rate": 2e-05, "loss": 0.5993, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14824, "tokens_per_second_per_gpu": 11257.61, "total_tokens": 1463476841 }, { "epoch": 0.9267941985496374, "grad_norm": 0.8867603540420532, "learning_rate": 2e-05, "loss": 0.6378, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14825, "tokens_per_second_per_gpu": 10160.02, "total_tokens": 1463575856 }, { "epoch": 0.9268567141785446, "grad_norm": 0.8513668179512024, "learning_rate": 2e-05, "loss": 0.5859, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14826, "tokens_per_second_per_gpu": 10276.86, "total_tokens": 1463676727 }, { "epoch": 0.9269192298074519, "grad_norm": 0.9008662104606628, "learning_rate": 2e-05, "loss": 0.6201, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14827, "tokens_per_second_per_gpu": 10554.77, "total_tokens": 1463773705 }, { "epoch": 0.9269817454363591, "grad_norm": 0.9242244958877563, "learning_rate": 2e-05, "loss": 0.5984, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14828, "tokens_per_second_per_gpu": 10356.64, "total_tokens": 1463870524 }, { "epoch": 0.9270442610652663, "grad_norm": 0.9129162430763245, "learning_rate": 2e-05, "loss": 0.6303, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14829, "tokens_per_second_per_gpu": 10527.94, "total_tokens": 1463970973 }, { "epoch": 0.9271067766941735, "grad_norm": 0.9042574763298035, "learning_rate": 2e-05, "loss": 0.6303, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14830, "tokens_per_second_per_gpu": 10486.27, "total_tokens": 1464068893 }, { "epoch": 0.9271692923230808, "grad_norm": 0.8659193515777588, "learning_rate": 2e-05, "loss": 0.6224, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14831, "tokens_per_second_per_gpu": 11292.75, "total_tokens": 1464169728 }, { "epoch": 0.927231807951988, "grad_norm": 0.8742033839225769, "learning_rate": 2e-05, "loss": 0.6203, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14832, "tokens_per_second_per_gpu": 10795.38, "total_tokens": 1464270586 }, { "epoch": 0.9272943235808953, "grad_norm": 0.8503315448760986, "learning_rate": 2e-05, "loss": 0.5762, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14833, "tokens_per_second_per_gpu": 9952.82, "total_tokens": 1464367287 }, { "epoch": 0.9273568392098025, "grad_norm": 0.8892907500267029, "learning_rate": 2e-05, "loss": 0.6469, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14834, "tokens_per_second_per_gpu": 11894.92, "total_tokens": 1464471468 }, { "epoch": 0.9274193548387096, "grad_norm": 0.8845133185386658, "learning_rate": 2e-05, "loss": 0.5961, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14835, "tokens_per_second_per_gpu": 11258.72, "total_tokens": 1464572418 }, { "epoch": 0.9274818704676169, "grad_norm": 0.8462642431259155, "learning_rate": 2e-05, "loss": 0.5662, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14836, "tokens_per_second_per_gpu": 9721.08, "total_tokens": 1464670423 }, { "epoch": 0.9275443860965241, "grad_norm": 0.9176749587059021, "learning_rate": 2e-05, "loss": 0.5731, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14837, "tokens_per_second_per_gpu": 9733.52, "total_tokens": 1464763691 }, { "epoch": 0.9276069017254314, "grad_norm": 0.8765305876731873, "learning_rate": 2e-05, "loss": 0.6207, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14838, "tokens_per_second_per_gpu": 10671.68, "total_tokens": 1464865003 }, { "epoch": 0.9276694173543386, "grad_norm": 0.8853366374969482, "learning_rate": 2e-05, "loss": 0.5963, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14839, "tokens_per_second_per_gpu": 11431.25, "total_tokens": 1464965481 }, { "epoch": 0.9277319329832459, "grad_norm": 0.8989433646202087, "learning_rate": 2e-05, "loss": 0.5932, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14840, "tokens_per_second_per_gpu": 9994.05, "total_tokens": 1465061023 }, { "epoch": 0.927794448612153, "grad_norm": 0.8849348425865173, "learning_rate": 2e-05, "loss": 0.609, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14841, "tokens_per_second_per_gpu": 10355.76, "total_tokens": 1465160481 }, { "epoch": 0.9278569642410602, "grad_norm": 0.8668153285980225, "learning_rate": 2e-05, "loss": 0.62, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14842, "tokens_per_second_per_gpu": 10593.76, "total_tokens": 1465259348 }, { "epoch": 0.9279194798699675, "grad_norm": 0.8659926652908325, "learning_rate": 2e-05, "loss": 0.5922, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14843, "tokens_per_second_per_gpu": 11416.74, "total_tokens": 1465361083 }, { "epoch": 0.9279819954988747, "grad_norm": 0.9046767354011536, "learning_rate": 2e-05, "loss": 0.6608, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14844, "tokens_per_second_per_gpu": 11174.21, "total_tokens": 1465461797 }, { "epoch": 0.928044511127782, "grad_norm": 0.8681780695915222, "learning_rate": 2e-05, "loss": 0.5539, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14845, "tokens_per_second_per_gpu": 9554.56, "total_tokens": 1465558183 }, { "epoch": 0.9281070267566892, "grad_norm": 0.8654607534408569, "learning_rate": 2e-05, "loss": 0.61, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14846, "tokens_per_second_per_gpu": 10237.71, "total_tokens": 1465656537 }, { "epoch": 0.9281695423855963, "grad_norm": 0.8630863428115845, "learning_rate": 2e-05, "loss": 0.6182, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14847, "tokens_per_second_per_gpu": 10457.69, "total_tokens": 1465756977 }, { "epoch": 0.9282320580145036, "grad_norm": 0.8309266567230225, "learning_rate": 2e-05, "loss": 0.567, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14848, "tokens_per_second_per_gpu": 10773.53, "total_tokens": 1465853388 }, { "epoch": 0.9282945736434108, "grad_norm": 0.9026180505752563, "learning_rate": 2e-05, "loss": 0.6068, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14849, "tokens_per_second_per_gpu": 10793.92, "total_tokens": 1465952062 }, { "epoch": 0.9283570892723181, "grad_norm": 0.8971503973007202, "learning_rate": 2e-05, "loss": 0.6243, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14850, "tokens_per_second_per_gpu": 11226.33, "total_tokens": 1466053235 }, { "epoch": 0.9284196049012253, "grad_norm": 0.9192681312561035, "learning_rate": 2e-05, "loss": 0.6106, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14851, "tokens_per_second_per_gpu": 9732.15, "total_tokens": 1466148039 }, { "epoch": 0.9284821205301326, "grad_norm": 0.9318327903747559, "learning_rate": 2e-05, "loss": 0.6195, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14852, "tokens_per_second_per_gpu": 10361.68, "total_tokens": 1466246605 }, { "epoch": 0.9285446361590397, "grad_norm": 0.8808906078338623, "learning_rate": 2e-05, "loss": 0.585, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14853, "tokens_per_second_per_gpu": 10749.33, "total_tokens": 1466346166 }, { "epoch": 0.928607151787947, "grad_norm": 0.9017652273178101, "learning_rate": 2e-05, "loss": 0.5823, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14854, "tokens_per_second_per_gpu": 11050.96, "total_tokens": 1466446031 }, { "epoch": 0.9286696674168542, "grad_norm": 0.8975799679756165, "learning_rate": 2e-05, "loss": 0.5968, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14855, "tokens_per_second_per_gpu": 10038.99, "total_tokens": 1466544523 }, { "epoch": 0.9287321830457614, "grad_norm": 0.8802412748336792, "learning_rate": 2e-05, "loss": 0.6117, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14856, "tokens_per_second_per_gpu": 9984.48, "total_tokens": 1466643498 }, { "epoch": 0.9287946986746687, "grad_norm": 0.9079053401947021, "learning_rate": 2e-05, "loss": 0.6013, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14857, "tokens_per_second_per_gpu": 9721.62, "total_tokens": 1466740126 }, { "epoch": 0.9288572143035759, "grad_norm": 0.8891069889068604, "learning_rate": 2e-05, "loss": 0.5844, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14858, "tokens_per_second_per_gpu": 10395.93, "total_tokens": 1466835886 }, { "epoch": 0.9289197299324832, "grad_norm": 0.9099606871604919, "learning_rate": 2e-05, "loss": 0.6203, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14859, "tokens_per_second_per_gpu": 10556.33, "total_tokens": 1466933052 }, { "epoch": 0.9289822455613903, "grad_norm": 0.8908995985984802, "learning_rate": 2e-05, "loss": 0.6292, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14860, "tokens_per_second_per_gpu": 10910.68, "total_tokens": 1467033774 }, { "epoch": 0.9290447611902976, "grad_norm": 0.879465639591217, "learning_rate": 2e-05, "loss": 0.6187, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14861, "tokens_per_second_per_gpu": 10832.64, "total_tokens": 1467136529 }, { "epoch": 0.9291072768192048, "grad_norm": 0.9238522052764893, "learning_rate": 2e-05, "loss": 0.623, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14862, "tokens_per_second_per_gpu": 10035.59, "total_tokens": 1467236688 }, { "epoch": 0.929169792448112, "grad_norm": 0.8528492450714111, "learning_rate": 2e-05, "loss": 0.5935, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14863, "tokens_per_second_per_gpu": 11255.5, "total_tokens": 1467339267 }, { "epoch": 0.9292323080770193, "grad_norm": 0.874310314655304, "learning_rate": 2e-05, "loss": 0.6372, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14864, "tokens_per_second_per_gpu": 10043.24, "total_tokens": 1467440361 }, { "epoch": 0.9292948237059265, "grad_norm": 0.8462719321250916, "learning_rate": 2e-05, "loss": 0.627, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14865, "tokens_per_second_per_gpu": 10610.98, "total_tokens": 1467541438 }, { "epoch": 0.9293573393348337, "grad_norm": 0.9271491765975952, "learning_rate": 2e-05, "loss": 0.6168, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14866, "tokens_per_second_per_gpu": 9897.52, "total_tokens": 1467634097 }, { "epoch": 0.9294198549637409, "grad_norm": 0.9151625037193298, "learning_rate": 2e-05, "loss": 0.6136, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14867, "tokens_per_second_per_gpu": 10591.28, "total_tokens": 1467733593 }, { "epoch": 0.9294823705926482, "grad_norm": 0.888860821723938, "learning_rate": 2e-05, "loss": 0.6329, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14868, "tokens_per_second_per_gpu": 11132.99, "total_tokens": 1467835598 }, { "epoch": 0.9295448862215554, "grad_norm": 0.8918739557266235, "learning_rate": 2e-05, "loss": 0.5965, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14869, "tokens_per_second_per_gpu": 10395.2, "total_tokens": 1467933429 }, { "epoch": 0.9296074018504626, "grad_norm": 0.870888888835907, "learning_rate": 2e-05, "loss": 0.6061, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14870, "tokens_per_second_per_gpu": 10310.93, "total_tokens": 1468029958 }, { "epoch": 0.9296699174793699, "grad_norm": 0.8932607173919678, "learning_rate": 2e-05, "loss": 0.6167, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14871, "tokens_per_second_per_gpu": 10853.64, "total_tokens": 1468128675 }, { "epoch": 0.929732433108277, "grad_norm": 0.8552353978157043, "learning_rate": 2e-05, "loss": 0.647, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14872, "tokens_per_second_per_gpu": 11414.22, "total_tokens": 1468235275 }, { "epoch": 0.9297949487371843, "grad_norm": 0.8656431436538696, "learning_rate": 2e-05, "loss": 0.662, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14873, "tokens_per_second_per_gpu": 10035.79, "total_tokens": 1468336975 }, { "epoch": 0.9298574643660915, "grad_norm": 0.9572346806526184, "learning_rate": 2e-05, "loss": 0.5957, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14874, "tokens_per_second_per_gpu": 10074.26, "total_tokens": 1468434606 }, { "epoch": 0.9299199799949988, "grad_norm": 0.8759233951568604, "learning_rate": 2e-05, "loss": 0.5847, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14875, "tokens_per_second_per_gpu": 10439.33, "total_tokens": 1468531814 }, { "epoch": 0.929982495623906, "grad_norm": 0.9074166417121887, "learning_rate": 2e-05, "loss": 0.6529, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14876, "tokens_per_second_per_gpu": 10468.67, "total_tokens": 1468631944 }, { "epoch": 0.9300450112528132, "grad_norm": 0.8714338541030884, "learning_rate": 2e-05, "loss": 0.6156, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14877, "tokens_per_second_per_gpu": 10034.37, "total_tokens": 1468733305 }, { "epoch": 0.9301075268817204, "grad_norm": 0.9324004054069519, "learning_rate": 2e-05, "loss": 0.6471, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14878, "tokens_per_second_per_gpu": 10288.51, "total_tokens": 1468833306 }, { "epoch": 0.9301700425106276, "grad_norm": 0.8700844645500183, "learning_rate": 2e-05, "loss": 0.6353, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14879, "tokens_per_second_per_gpu": 10738.5, "total_tokens": 1468931124 }, { "epoch": 0.9302325581395349, "grad_norm": 0.8747789263725281, "learning_rate": 2e-05, "loss": 0.5931, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14880, "tokens_per_second_per_gpu": 10859.92, "total_tokens": 1469034024 }, { "epoch": 0.9302950737684421, "grad_norm": 0.8805594444274902, "learning_rate": 2e-05, "loss": 0.635, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14881, "tokens_per_second_per_gpu": 11031.88, "total_tokens": 1469137577 }, { "epoch": 0.9303575893973494, "grad_norm": 0.888908863067627, "learning_rate": 2e-05, "loss": 0.6359, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14882, "tokens_per_second_per_gpu": 10678.51, "total_tokens": 1469239156 }, { "epoch": 0.9304201050262566, "grad_norm": 0.8923923373222351, "learning_rate": 2e-05, "loss": 0.6021, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14883, "tokens_per_second_per_gpu": 10822.78, "total_tokens": 1469334685 }, { "epoch": 0.9304826206551637, "grad_norm": 0.8865216970443726, "learning_rate": 2e-05, "loss": 0.6298, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14884, "tokens_per_second_per_gpu": 11170.37, "total_tokens": 1469433952 }, { "epoch": 0.930545136284071, "grad_norm": 0.8869566321372986, "learning_rate": 2e-05, "loss": 0.6222, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14885, "tokens_per_second_per_gpu": 11360.0, "total_tokens": 1469536183 }, { "epoch": 0.9306076519129782, "grad_norm": 0.8805161118507385, "learning_rate": 2e-05, "loss": 0.6115, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14886, "tokens_per_second_per_gpu": 10687.63, "total_tokens": 1469634980 }, { "epoch": 0.9306701675418855, "grad_norm": 0.8770242929458618, "learning_rate": 2e-05, "loss": 0.5931, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14887, "tokens_per_second_per_gpu": 10522.81, "total_tokens": 1469735719 }, { "epoch": 0.9307326831707927, "grad_norm": 0.8708507418632507, "learning_rate": 2e-05, "loss": 0.6042, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14888, "tokens_per_second_per_gpu": 10759.85, "total_tokens": 1469836810 }, { "epoch": 0.9307951987997, "grad_norm": 0.8904533386230469, "learning_rate": 2e-05, "loss": 0.6163, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14889, "tokens_per_second_per_gpu": 11172.58, "total_tokens": 1469937884 }, { "epoch": 0.9308577144286071, "grad_norm": 0.8724551796913147, "learning_rate": 2e-05, "loss": 0.6023, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14890, "tokens_per_second_per_gpu": 10373.51, "total_tokens": 1470034657 }, { "epoch": 0.9309202300575143, "grad_norm": 0.8799554705619812, "learning_rate": 2e-05, "loss": 0.6001, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14891, "tokens_per_second_per_gpu": 10403.26, "total_tokens": 1470129630 }, { "epoch": 0.9309827456864216, "grad_norm": 0.8579647541046143, "learning_rate": 2e-05, "loss": 0.6524, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14892, "tokens_per_second_per_gpu": 10765.24, "total_tokens": 1470230654 }, { "epoch": 0.9310452613153288, "grad_norm": 0.8576148748397827, "learning_rate": 2e-05, "loss": 0.5918, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14893, "tokens_per_second_per_gpu": 10333.76, "total_tokens": 1470331768 }, { "epoch": 0.9311077769442361, "grad_norm": 0.8689904808998108, "learning_rate": 2e-05, "loss": 0.5869, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14894, "tokens_per_second_per_gpu": 11076.67, "total_tokens": 1470430109 }, { "epoch": 0.9311702925731433, "grad_norm": 0.9398348331451416, "learning_rate": 2e-05, "loss": 0.6682, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14895, "tokens_per_second_per_gpu": 8854.69, "total_tokens": 1470523670 }, { "epoch": 0.9312328082020506, "grad_norm": 0.8709431886672974, "learning_rate": 2e-05, "loss": 0.6011, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14896, "tokens_per_second_per_gpu": 11069.65, "total_tokens": 1470625470 }, { "epoch": 0.9312953238309577, "grad_norm": 0.9218832850456238, "learning_rate": 2e-05, "loss": 0.6006, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14897, "tokens_per_second_per_gpu": 10730.36, "total_tokens": 1470726487 }, { "epoch": 0.931357839459865, "grad_norm": 0.862824559211731, "learning_rate": 2e-05, "loss": 0.6012, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14898, "tokens_per_second_per_gpu": 11145.54, "total_tokens": 1470827946 }, { "epoch": 0.9314203550887722, "grad_norm": 0.8710941076278687, "learning_rate": 2e-05, "loss": 0.6138, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14899, "tokens_per_second_per_gpu": 10630.91, "total_tokens": 1470924514 }, { "epoch": 0.9314828707176794, "grad_norm": 0.8885777592658997, "learning_rate": 2e-05, "loss": 0.6652, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14900, "tokens_per_second_per_gpu": 10060.19, "total_tokens": 1471026433 }, { "epoch": 0.9315453863465867, "grad_norm": 0.8664413094520569, "learning_rate": 2e-05, "loss": 0.6093, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14901, "tokens_per_second_per_gpu": 11166.0, "total_tokens": 1471128647 }, { "epoch": 0.9316079019754939, "grad_norm": 0.9400983452796936, "learning_rate": 2e-05, "loss": 0.5804, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14902, "tokens_per_second_per_gpu": 9746.35, "total_tokens": 1471220553 }, { "epoch": 0.9316704176044011, "grad_norm": 0.8995975255966187, "learning_rate": 2e-05, "loss": 0.6295, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14903, "tokens_per_second_per_gpu": 10856.1, "total_tokens": 1471318915 }, { "epoch": 0.9317329332333083, "grad_norm": 0.9029720425605774, "learning_rate": 2e-05, "loss": 0.5834, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14904, "tokens_per_second_per_gpu": 10387.75, "total_tokens": 1471416705 }, { "epoch": 0.9317954488622155, "grad_norm": 0.9007397294044495, "learning_rate": 2e-05, "loss": 0.6022, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14905, "tokens_per_second_per_gpu": 10814.19, "total_tokens": 1471515042 }, { "epoch": 0.9318579644911228, "grad_norm": 0.9736199378967285, "learning_rate": 2e-05, "loss": 0.6024, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14906, "tokens_per_second_per_gpu": 11452.21, "total_tokens": 1471615472 }, { "epoch": 0.93192048012003, "grad_norm": 0.9019030332565308, "learning_rate": 2e-05, "loss": 0.6117, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14907, "tokens_per_second_per_gpu": 11657.73, "total_tokens": 1471718323 }, { "epoch": 0.9319829957489373, "grad_norm": 0.9288674592971802, "learning_rate": 2e-05, "loss": 0.6069, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14908, "tokens_per_second_per_gpu": 10341.63, "total_tokens": 1471816739 }, { "epoch": 0.9320455113778444, "grad_norm": 0.9031699299812317, "learning_rate": 2e-05, "loss": 0.6205, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14909, "tokens_per_second_per_gpu": 10949.86, "total_tokens": 1471918724 }, { "epoch": 0.9321080270067517, "grad_norm": 0.9696144461631775, "learning_rate": 2e-05, "loss": 0.6443, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14910, "tokens_per_second_per_gpu": 10023.09, "total_tokens": 1472015018 }, { "epoch": 0.9321705426356589, "grad_norm": 0.906387209892273, "learning_rate": 2e-05, "loss": 0.6156, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14911, "tokens_per_second_per_gpu": 10667.08, "total_tokens": 1472115279 }, { "epoch": 0.9322330582645662, "grad_norm": 0.9478656649589539, "learning_rate": 2e-05, "loss": 0.6438, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14912, "tokens_per_second_per_gpu": 11082.12, "total_tokens": 1472215766 }, { "epoch": 0.9322955738934734, "grad_norm": 0.8923014998435974, "learning_rate": 2e-05, "loss": 0.5916, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14913, "tokens_per_second_per_gpu": 9667.6, "total_tokens": 1472310266 }, { "epoch": 0.9323580895223806, "grad_norm": 0.8936993479728699, "learning_rate": 2e-05, "loss": 0.6259, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14914, "tokens_per_second_per_gpu": 10214.5, "total_tokens": 1472410003 }, { "epoch": 0.9324206051512878, "grad_norm": 0.8752124905586243, "learning_rate": 2e-05, "loss": 0.5729, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14915, "tokens_per_second_per_gpu": 10348.31, "total_tokens": 1472510298 }, { "epoch": 0.932483120780195, "grad_norm": 0.8641965389251709, "learning_rate": 2e-05, "loss": 0.6316, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14916, "tokens_per_second_per_gpu": 10704.97, "total_tokens": 1472612793 }, { "epoch": 0.9325456364091023, "grad_norm": 0.9179131984710693, "learning_rate": 2e-05, "loss": 0.6281, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14917, "tokens_per_second_per_gpu": 10527.61, "total_tokens": 1472708823 }, { "epoch": 0.9326081520380095, "grad_norm": 0.8935728073120117, "learning_rate": 2e-05, "loss": 0.5871, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14918, "tokens_per_second_per_gpu": 10091.28, "total_tokens": 1472804322 }, { "epoch": 0.9326706676669168, "grad_norm": 0.9081757664680481, "learning_rate": 2e-05, "loss": 0.5974, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14919, "tokens_per_second_per_gpu": 10552.68, "total_tokens": 1472901837 }, { "epoch": 0.932733183295824, "grad_norm": 0.8641599416732788, "learning_rate": 2e-05, "loss": 0.5761, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14920, "tokens_per_second_per_gpu": 10924.19, "total_tokens": 1473000442 }, { "epoch": 0.9327956989247311, "grad_norm": 0.905470609664917, "learning_rate": 2e-05, "loss": 0.6463, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14921, "tokens_per_second_per_gpu": 10270.61, "total_tokens": 1473097550 }, { "epoch": 0.9328582145536384, "grad_norm": 0.8785147666931152, "learning_rate": 2e-05, "loss": 0.5965, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14922, "tokens_per_second_per_gpu": 11062.89, "total_tokens": 1473198793 }, { "epoch": 0.9329207301825456, "grad_norm": 0.9175630211830139, "learning_rate": 2e-05, "loss": 0.6119, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14923, "tokens_per_second_per_gpu": 10564.46, "total_tokens": 1473298041 }, { "epoch": 0.9329832458114529, "grad_norm": 0.8898429274559021, "learning_rate": 2e-05, "loss": 0.6322, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14924, "tokens_per_second_per_gpu": 10248.31, "total_tokens": 1473396861 }, { "epoch": 0.9330457614403601, "grad_norm": 0.8692377209663391, "learning_rate": 2e-05, "loss": 0.5971, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14925, "tokens_per_second_per_gpu": 10813.08, "total_tokens": 1473499231 }, { "epoch": 0.9331082770692674, "grad_norm": 0.8726300001144409, "learning_rate": 2e-05, "loss": 0.623, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14926, "tokens_per_second_per_gpu": 11078.71, "total_tokens": 1473597394 }, { "epoch": 0.9331707926981745, "grad_norm": 0.8883563876152039, "learning_rate": 2e-05, "loss": 0.6115, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14927, "tokens_per_second_per_gpu": 9904.77, "total_tokens": 1473692868 }, { "epoch": 0.9332333083270817, "grad_norm": 0.8988426923751831, "learning_rate": 2e-05, "loss": 0.6184, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14928, "tokens_per_second_per_gpu": 10737.4, "total_tokens": 1473790264 }, { "epoch": 0.933295823955989, "grad_norm": 0.8893373012542725, "learning_rate": 2e-05, "loss": 0.622, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14929, "tokens_per_second_per_gpu": 10942.83, "total_tokens": 1473891077 }, { "epoch": 0.9333583395848962, "grad_norm": 0.9149094820022583, "learning_rate": 2e-05, "loss": 0.6113, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14930, "tokens_per_second_per_gpu": 9936.51, "total_tokens": 1473987529 }, { "epoch": 0.9334208552138035, "grad_norm": 0.8950507640838623, "learning_rate": 2e-05, "loss": 0.6195, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14931, "tokens_per_second_per_gpu": 10636.6, "total_tokens": 1474086930 }, { "epoch": 0.9334833708427107, "grad_norm": 0.9013180732727051, "learning_rate": 2e-05, "loss": 0.6156, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14932, "tokens_per_second_per_gpu": 10020.61, "total_tokens": 1474186626 }, { "epoch": 0.933545886471618, "grad_norm": 0.885892391204834, "learning_rate": 2e-05, "loss": 0.6681, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14933, "tokens_per_second_per_gpu": 10929.32, "total_tokens": 1474290369 }, { "epoch": 0.9336084021005251, "grad_norm": 0.872653067111969, "learning_rate": 2e-05, "loss": 0.6055, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14934, "tokens_per_second_per_gpu": 11063.03, "total_tokens": 1474393465 }, { "epoch": 0.9336709177294323, "grad_norm": 0.8886120915412903, "learning_rate": 2e-05, "loss": 0.5852, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14935, "tokens_per_second_per_gpu": 9749.92, "total_tokens": 1474488621 }, { "epoch": 0.9337334333583396, "grad_norm": 0.8694396018981934, "learning_rate": 2e-05, "loss": 0.6128, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14936, "tokens_per_second_per_gpu": 10947.35, "total_tokens": 1474589956 }, { "epoch": 0.9337959489872468, "grad_norm": 0.8877205848693848, "learning_rate": 2e-05, "loss": 0.6292, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14937, "tokens_per_second_per_gpu": 10377.84, "total_tokens": 1474687001 }, { "epoch": 0.9338584646161541, "grad_norm": 0.91359943151474, "learning_rate": 2e-05, "loss": 0.6064, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14938, "tokens_per_second_per_gpu": 10799.3, "total_tokens": 1474789784 }, { "epoch": 0.9339209802450613, "grad_norm": 0.8601431250572205, "learning_rate": 2e-05, "loss": 0.5775, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14939, "tokens_per_second_per_gpu": 9925.38, "total_tokens": 1474887102 }, { "epoch": 0.9339834958739685, "grad_norm": 0.896091878414154, "learning_rate": 2e-05, "loss": 0.681, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14940, "tokens_per_second_per_gpu": 11072.53, "total_tokens": 1474994215 }, { "epoch": 0.9340460115028757, "grad_norm": 0.8808904886245728, "learning_rate": 2e-05, "loss": 0.6261, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14941, "tokens_per_second_per_gpu": 11590.17, "total_tokens": 1475095732 }, { "epoch": 0.9341085271317829, "grad_norm": 0.8542319536209106, "learning_rate": 2e-05, "loss": 0.6006, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14942, "tokens_per_second_per_gpu": 11290.44, "total_tokens": 1475197477 }, { "epoch": 0.9341710427606902, "grad_norm": 0.8926435708999634, "learning_rate": 2e-05, "loss": 0.6024, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14943, "tokens_per_second_per_gpu": 10331.61, "total_tokens": 1475294059 }, { "epoch": 0.9342335583895974, "grad_norm": 0.8986483812332153, "learning_rate": 2e-05, "loss": 0.6346, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14944, "tokens_per_second_per_gpu": 10417.85, "total_tokens": 1475392735 }, { "epoch": 0.9342960740185047, "grad_norm": 0.9139759540557861, "learning_rate": 2e-05, "loss": 0.5983, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14945, "tokens_per_second_per_gpu": 10359.98, "total_tokens": 1475488080 }, { "epoch": 0.9343585896474118, "grad_norm": 0.8877946138381958, "learning_rate": 2e-05, "loss": 0.6102, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14946, "tokens_per_second_per_gpu": 10812.79, "total_tokens": 1475589279 }, { "epoch": 0.934421105276319, "grad_norm": 0.9293942451477051, "learning_rate": 2e-05, "loss": 0.5964, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14947, "tokens_per_second_per_gpu": 10063.3, "total_tokens": 1475685579 }, { "epoch": 0.9344836209052263, "grad_norm": 0.8641296625137329, "learning_rate": 2e-05, "loss": 0.6002, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14948, "tokens_per_second_per_gpu": 10336.09, "total_tokens": 1475787629 }, { "epoch": 0.9345461365341335, "grad_norm": 0.9291514158248901, "learning_rate": 2e-05, "loss": 0.597, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14949, "tokens_per_second_per_gpu": 10598.46, "total_tokens": 1475887446 }, { "epoch": 0.9346086521630408, "grad_norm": 0.9001958966255188, "learning_rate": 2e-05, "loss": 0.5911, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14950, "tokens_per_second_per_gpu": 10393.3, "total_tokens": 1475984538 }, { "epoch": 0.934671167791948, "grad_norm": 0.8829813003540039, "learning_rate": 2e-05, "loss": 0.6203, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14951, "tokens_per_second_per_gpu": 10741.91, "total_tokens": 1476083399 }, { "epoch": 0.9347336834208552, "grad_norm": 0.9117534160614014, "learning_rate": 2e-05, "loss": 0.6252, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14952, "tokens_per_second_per_gpu": 10699.67, "total_tokens": 1476177967 }, { "epoch": 0.9347961990497624, "grad_norm": 0.8626015186309814, "learning_rate": 2e-05, "loss": 0.5769, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14953, "tokens_per_second_per_gpu": 10418.89, "total_tokens": 1476276450 }, { "epoch": 0.9348587146786697, "grad_norm": 0.9005811810493469, "learning_rate": 2e-05, "loss": 0.5748, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14954, "tokens_per_second_per_gpu": 11033.07, "total_tokens": 1476378306 }, { "epoch": 0.9349212303075769, "grad_norm": 0.9195430278778076, "learning_rate": 2e-05, "loss": 0.6137, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14955, "tokens_per_second_per_gpu": 10513.31, "total_tokens": 1476478942 }, { "epoch": 0.9349837459364841, "grad_norm": 0.8937172889709473, "learning_rate": 2e-05, "loss": 0.5962, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14956, "tokens_per_second_per_gpu": 10566.48, "total_tokens": 1476581009 }, { "epoch": 0.9350462615653914, "grad_norm": 0.8946530818939209, "learning_rate": 2e-05, "loss": 0.6004, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14957, "tokens_per_second_per_gpu": 10206.92, "total_tokens": 1476679776 }, { "epoch": 0.9351087771942985, "grad_norm": 0.8859313130378723, "learning_rate": 2e-05, "loss": 0.6032, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14958, "tokens_per_second_per_gpu": 10845.76, "total_tokens": 1476781873 }, { "epoch": 0.9351712928232058, "grad_norm": 0.8825061917304993, "learning_rate": 2e-05, "loss": 0.5852, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14959, "tokens_per_second_per_gpu": 10491.55, "total_tokens": 1476881694 }, { "epoch": 0.935233808452113, "grad_norm": 0.9488799571990967, "learning_rate": 2e-05, "loss": 0.6192, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14960, "tokens_per_second_per_gpu": 10697.45, "total_tokens": 1476978415 }, { "epoch": 0.9352963240810203, "grad_norm": 0.904158890247345, "learning_rate": 2e-05, "loss": 0.6402, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14961, "tokens_per_second_per_gpu": 11059.65, "total_tokens": 1477080937 }, { "epoch": 0.9353588397099275, "grad_norm": 0.9416583776473999, "learning_rate": 2e-05, "loss": 0.6163, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14962, "tokens_per_second_per_gpu": 10242.18, "total_tokens": 1477174573 }, { "epoch": 0.9354213553388347, "grad_norm": 0.8986179232597351, "learning_rate": 2e-05, "loss": 0.5955, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14963, "tokens_per_second_per_gpu": 11114.45, "total_tokens": 1477276099 }, { "epoch": 0.9354838709677419, "grad_norm": 0.8883184194564819, "learning_rate": 2e-05, "loss": 0.627, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14964, "tokens_per_second_per_gpu": 10300.06, "total_tokens": 1477375870 }, { "epoch": 0.9355463865966491, "grad_norm": 0.8970866203308105, "learning_rate": 2e-05, "loss": 0.5678, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14965, "tokens_per_second_per_gpu": 10902.88, "total_tokens": 1477471961 }, { "epoch": 0.9356089022255564, "grad_norm": 0.8774704933166504, "learning_rate": 2e-05, "loss": 0.5923, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14966, "tokens_per_second_per_gpu": 10216.04, "total_tokens": 1477565505 }, { "epoch": 0.9356714178544636, "grad_norm": 0.9065943360328674, "learning_rate": 2e-05, "loss": 0.6096, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14967, "tokens_per_second_per_gpu": 9806.48, "total_tokens": 1477661553 }, { "epoch": 0.9357339334833709, "grad_norm": 0.8942015171051025, "learning_rate": 2e-05, "loss": 0.6235, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14968, "tokens_per_second_per_gpu": 10776.42, "total_tokens": 1477761249 }, { "epoch": 0.9357964491122781, "grad_norm": 0.9293606877326965, "learning_rate": 2e-05, "loss": 0.6102, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14969, "tokens_per_second_per_gpu": 10362.59, "total_tokens": 1477860364 }, { "epoch": 0.9358589647411854, "grad_norm": 0.9034631848335266, "learning_rate": 2e-05, "loss": 0.5867, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14970, "tokens_per_second_per_gpu": 10244.97, "total_tokens": 1477958370 }, { "epoch": 0.9359214803700925, "grad_norm": 0.8800457119941711, "learning_rate": 2e-05, "loss": 0.5868, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14971, "tokens_per_second_per_gpu": 10652.85, "total_tokens": 1478055522 }, { "epoch": 0.9359839959989997, "grad_norm": 0.8582823276519775, "learning_rate": 2e-05, "loss": 0.6238, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14972, "tokens_per_second_per_gpu": 11553.89, "total_tokens": 1478158182 }, { "epoch": 0.936046511627907, "grad_norm": 0.8609066009521484, "learning_rate": 2e-05, "loss": 0.6152, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14973, "tokens_per_second_per_gpu": 10588.9, "total_tokens": 1478258026 }, { "epoch": 0.9361090272568142, "grad_norm": 0.9197193384170532, "learning_rate": 2e-05, "loss": 0.6178, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14974, "tokens_per_second_per_gpu": 11759.16, "total_tokens": 1478359806 }, { "epoch": 0.9361715428857215, "grad_norm": 0.917182445526123, "learning_rate": 2e-05, "loss": 0.6284, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14975, "tokens_per_second_per_gpu": 11061.85, "total_tokens": 1478459032 }, { "epoch": 0.9362340585146287, "grad_norm": 0.916812539100647, "learning_rate": 2e-05, "loss": 0.6209, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14976, "tokens_per_second_per_gpu": 10329.31, "total_tokens": 1478554024 }, { "epoch": 0.9362965741435358, "grad_norm": 0.935624361038208, "learning_rate": 2e-05, "loss": 0.5738, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14977, "tokens_per_second_per_gpu": 9493.17, "total_tokens": 1478646979 }, { "epoch": 0.9363590897724431, "grad_norm": 0.922989547252655, "learning_rate": 2e-05, "loss": 0.5958, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14978, "tokens_per_second_per_gpu": 10519.28, "total_tokens": 1478743133 }, { "epoch": 0.9364216054013503, "grad_norm": 0.8812767863273621, "learning_rate": 2e-05, "loss": 0.5966, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14979, "tokens_per_second_per_gpu": 9901.34, "total_tokens": 1478841957 }, { "epoch": 0.9364841210302576, "grad_norm": 0.9183963537216187, "learning_rate": 2e-05, "loss": 0.6026, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14980, "tokens_per_second_per_gpu": 10175.94, "total_tokens": 1478936946 }, { "epoch": 0.9365466366591648, "grad_norm": 0.8745080232620239, "learning_rate": 2e-05, "loss": 0.6062, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14981, "tokens_per_second_per_gpu": 10653.92, "total_tokens": 1479035277 }, { "epoch": 0.9366091522880721, "grad_norm": 0.8950445055961609, "learning_rate": 2e-05, "loss": 0.6228, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14982, "tokens_per_second_per_gpu": 11211.28, "total_tokens": 1479139003 }, { "epoch": 0.9366716679169792, "grad_norm": 0.9032835364341736, "learning_rate": 2e-05, "loss": 0.6197, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14983, "tokens_per_second_per_gpu": 10299.14, "total_tokens": 1479237620 }, { "epoch": 0.9367341835458864, "grad_norm": 0.9225999116897583, "learning_rate": 2e-05, "loss": 0.5829, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14984, "tokens_per_second_per_gpu": 10066.33, "total_tokens": 1479332009 }, { "epoch": 0.9367966991747937, "grad_norm": 0.8840008974075317, "learning_rate": 2e-05, "loss": 0.643, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14985, "tokens_per_second_per_gpu": 10720.69, "total_tokens": 1479434135 }, { "epoch": 0.9368592148037009, "grad_norm": 0.8903309106826782, "learning_rate": 2e-05, "loss": 0.6013, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14986, "tokens_per_second_per_gpu": 9940.77, "total_tokens": 1479527742 }, { "epoch": 0.9369217304326082, "grad_norm": 0.8792116045951843, "learning_rate": 2e-05, "loss": 0.5963, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14987, "tokens_per_second_per_gpu": 10375.19, "total_tokens": 1479628548 }, { "epoch": 0.9369842460615154, "grad_norm": 0.9463367462158203, "learning_rate": 2e-05, "loss": 0.6099, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14988, "tokens_per_second_per_gpu": 10382.98, "total_tokens": 1479726783 }, { "epoch": 0.9370467616904226, "grad_norm": 0.9408330917358398, "learning_rate": 2e-05, "loss": 0.6028, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14989, "tokens_per_second_per_gpu": 12206.13, "total_tokens": 1479825388 }, { "epoch": 0.9371092773193298, "grad_norm": 0.9018954038619995, "learning_rate": 2e-05, "loss": 0.6196, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14990, "tokens_per_second_per_gpu": 12843.9, "total_tokens": 1479923389 }, { "epoch": 0.937171792948237, "grad_norm": 0.8489875793457031, "learning_rate": 2e-05, "loss": 0.5821, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14991, "tokens_per_second_per_gpu": 10203.81, "total_tokens": 1480020955 }, { "epoch": 0.9372343085771443, "grad_norm": 0.9509686231613159, "learning_rate": 2e-05, "loss": 0.618, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14992, "tokens_per_second_per_gpu": 10198.28, "total_tokens": 1480116498 }, { "epoch": 0.9372968242060515, "grad_norm": 0.9273594617843628, "learning_rate": 2e-05, "loss": 0.6069, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14993, "tokens_per_second_per_gpu": 9117.42, "total_tokens": 1480211613 }, { "epoch": 0.9373593398349588, "grad_norm": 0.8849499821662903, "learning_rate": 2e-05, "loss": 0.6477, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14994, "tokens_per_second_per_gpu": 11449.58, "total_tokens": 1480314224 }, { "epoch": 0.9374218554638659, "grad_norm": 0.8954495191574097, "learning_rate": 2e-05, "loss": 0.6332, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14995, "tokens_per_second_per_gpu": 11308.94, "total_tokens": 1480415753 }, { "epoch": 0.9374843710927732, "grad_norm": 0.8599191904067993, "learning_rate": 2e-05, "loss": 0.6194, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14996, "tokens_per_second_per_gpu": 10567.05, "total_tokens": 1480519291 }, { "epoch": 0.9375468867216804, "grad_norm": 0.8607118725776672, "learning_rate": 2e-05, "loss": 0.5752, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14997, "tokens_per_second_per_gpu": 10290.79, "total_tokens": 1480616603 }, { "epoch": 0.9376094023505877, "grad_norm": 0.9085047841072083, "learning_rate": 2e-05, "loss": 0.5892, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14998, "tokens_per_second_per_gpu": 10411.35, "total_tokens": 1480713060 }, { "epoch": 0.9376719179794949, "grad_norm": 0.8866420984268188, "learning_rate": 2e-05, "loss": 0.6096, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 14999, "tokens_per_second_per_gpu": 10439.42, "total_tokens": 1480810043 }, { "epoch": 0.9377344336084021, "grad_norm": 0.8544318079948425, "learning_rate": 2e-05, "loss": 0.6084, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15000, "tokens_per_second_per_gpu": 10383.27, "total_tokens": 1480912439 }, { "epoch": 0.9377969492373093, "grad_norm": 0.9126107692718506, "learning_rate": 2e-05, "loss": 0.6485, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15001, "tokens_per_second_per_gpu": 10761.08, "total_tokens": 1481014127 }, { "epoch": 0.9378594648662165, "grad_norm": 0.8815579414367676, "learning_rate": 2e-05, "loss": 0.6076, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15002, "tokens_per_second_per_gpu": 10819.78, "total_tokens": 1481115477 }, { "epoch": 0.9379219804951238, "grad_norm": 0.922247052192688, "learning_rate": 2e-05, "loss": 0.6476, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15003, "tokens_per_second_per_gpu": 10306.36, "total_tokens": 1481211574 }, { "epoch": 0.937984496124031, "grad_norm": 0.8625620007514954, "learning_rate": 2e-05, "loss": 0.6311, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15004, "tokens_per_second_per_gpu": 10815.16, "total_tokens": 1481313107 }, { "epoch": 0.9380470117529383, "grad_norm": 0.9037025570869446, "learning_rate": 2e-05, "loss": 0.6069, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15005, "tokens_per_second_per_gpu": 10427.86, "total_tokens": 1481409769 }, { "epoch": 0.9381095273818455, "grad_norm": 0.9513998627662659, "learning_rate": 2e-05, "loss": 0.6076, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15006, "tokens_per_second_per_gpu": 10765.46, "total_tokens": 1481509113 }, { "epoch": 0.9381720430107527, "grad_norm": 0.9133151769638062, "learning_rate": 2e-05, "loss": 0.6138, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15007, "tokens_per_second_per_gpu": 9928.59, "total_tokens": 1481605585 }, { "epoch": 0.9382345586396599, "grad_norm": 0.9005635380744934, "learning_rate": 2e-05, "loss": 0.6294, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15008, "tokens_per_second_per_gpu": 10666.23, "total_tokens": 1481704451 }, { "epoch": 0.9382970742685671, "grad_norm": 0.9737799167633057, "learning_rate": 2e-05, "loss": 0.6039, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15009, "tokens_per_second_per_gpu": 10468.63, "total_tokens": 1481801317 }, { "epoch": 0.9383595898974744, "grad_norm": 0.9083783626556396, "learning_rate": 2e-05, "loss": 0.6262, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15010, "tokens_per_second_per_gpu": 10765.2, "total_tokens": 1481901399 }, { "epoch": 0.9384221055263816, "grad_norm": 0.8458245992660522, "learning_rate": 2e-05, "loss": 0.5889, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15011, "tokens_per_second_per_gpu": 10525.82, "total_tokens": 1482001986 }, { "epoch": 0.9384846211552889, "grad_norm": 0.8669788241386414, "learning_rate": 2e-05, "loss": 0.6242, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15012, "tokens_per_second_per_gpu": 10720.05, "total_tokens": 1482104320 }, { "epoch": 0.9385471367841961, "grad_norm": 0.8924604654312134, "learning_rate": 2e-05, "loss": 0.6038, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15013, "tokens_per_second_per_gpu": 10384.4, "total_tokens": 1482202115 }, { "epoch": 0.9386096524131032, "grad_norm": 0.882720947265625, "learning_rate": 2e-05, "loss": 0.6674, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15014, "tokens_per_second_per_gpu": 11748.29, "total_tokens": 1482305358 }, { "epoch": 0.9386721680420105, "grad_norm": 0.8789575099945068, "learning_rate": 2e-05, "loss": 0.582, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15015, "tokens_per_second_per_gpu": 10754.95, "total_tokens": 1482404227 }, { "epoch": 0.9387346836709177, "grad_norm": 0.9256453514099121, "learning_rate": 2e-05, "loss": 0.6296, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15016, "tokens_per_second_per_gpu": 10449.41, "total_tokens": 1482499765 }, { "epoch": 0.938797199299825, "grad_norm": 0.8705299496650696, "learning_rate": 2e-05, "loss": 0.5968, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15017, "tokens_per_second_per_gpu": 10686.36, "total_tokens": 1482597718 }, { "epoch": 0.9388597149287322, "grad_norm": 0.9411149621009827, "learning_rate": 2e-05, "loss": 0.6223, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15018, "tokens_per_second_per_gpu": 11046.34, "total_tokens": 1482693602 }, { "epoch": 0.9389222305576395, "grad_norm": 0.9338757991790771, "learning_rate": 2e-05, "loss": 0.6205, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15019, "tokens_per_second_per_gpu": 10356.55, "total_tokens": 1482792621 }, { "epoch": 0.9389847461865466, "grad_norm": 0.8790112137794495, "learning_rate": 2e-05, "loss": 0.6288, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15020, "tokens_per_second_per_gpu": 10305.21, "total_tokens": 1482892136 }, { "epoch": 0.9390472618154538, "grad_norm": 0.9020158648490906, "learning_rate": 2e-05, "loss": 0.568, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15021, "tokens_per_second_per_gpu": 10063.08, "total_tokens": 1482985413 }, { "epoch": 0.9391097774443611, "grad_norm": 0.8965180516242981, "learning_rate": 2e-05, "loss": 0.5882, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15022, "tokens_per_second_per_gpu": 10177.35, "total_tokens": 1483080687 }, { "epoch": 0.9391722930732683, "grad_norm": 0.8563303351402283, "learning_rate": 2e-05, "loss": 0.579, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15023, "tokens_per_second_per_gpu": 10860.9, "total_tokens": 1483181638 }, { "epoch": 0.9392348087021756, "grad_norm": 0.9288297295570374, "learning_rate": 2e-05, "loss": 0.6272, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15024, "tokens_per_second_per_gpu": 10184.83, "total_tokens": 1483282553 }, { "epoch": 0.9392973243310828, "grad_norm": 0.8515558242797852, "learning_rate": 2e-05, "loss": 0.6061, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15025, "tokens_per_second_per_gpu": 10802.51, "total_tokens": 1483383160 }, { "epoch": 0.93935983995999, "grad_norm": 0.8707495331764221, "learning_rate": 2e-05, "loss": 0.5918, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15026, "tokens_per_second_per_gpu": 10847.31, "total_tokens": 1483483310 }, { "epoch": 0.9394223555888972, "grad_norm": 0.9832702875137329, "learning_rate": 2e-05, "loss": 0.6095, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15027, "tokens_per_second_per_gpu": 9758.75, "total_tokens": 1483578143 }, { "epoch": 0.9394848712178044, "grad_norm": 0.880273163318634, "learning_rate": 2e-05, "loss": 0.6045, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15028, "tokens_per_second_per_gpu": 10681.89, "total_tokens": 1483678742 }, { "epoch": 0.9395473868467117, "grad_norm": 0.9487794637680054, "learning_rate": 2e-05, "loss": 0.6033, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15029, "tokens_per_second_per_gpu": 10410.97, "total_tokens": 1483773296 }, { "epoch": 0.9396099024756189, "grad_norm": 0.9400103688240051, "learning_rate": 2e-05, "loss": 0.6076, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15030, "tokens_per_second_per_gpu": 10603.49, "total_tokens": 1483872273 }, { "epoch": 0.9396724181045262, "grad_norm": 0.8940619826316833, "learning_rate": 2e-05, "loss": 0.6264, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15031, "tokens_per_second_per_gpu": 10541.55, "total_tokens": 1483971497 }, { "epoch": 0.9397349337334333, "grad_norm": 0.8978853821754456, "learning_rate": 2e-05, "loss": 0.6028, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15032, "tokens_per_second_per_gpu": 10744.52, "total_tokens": 1484068921 }, { "epoch": 0.9397974493623406, "grad_norm": 0.9027850031852722, "learning_rate": 2e-05, "loss": 0.6483, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15033, "tokens_per_second_per_gpu": 10803.32, "total_tokens": 1484162672 }, { "epoch": 0.9398599649912478, "grad_norm": 0.8775997161865234, "learning_rate": 2e-05, "loss": 0.6579, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15034, "tokens_per_second_per_gpu": 10390.29, "total_tokens": 1484262176 }, { "epoch": 0.939922480620155, "grad_norm": 0.8857653141021729, "learning_rate": 2e-05, "loss": 0.6078, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15035, "tokens_per_second_per_gpu": 10488.03, "total_tokens": 1484361550 }, { "epoch": 0.9399849962490623, "grad_norm": 1.0938585996627808, "learning_rate": 2e-05, "loss": 0.6001, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15036, "tokens_per_second_per_gpu": 10374.6, "total_tokens": 1484461073 }, { "epoch": 0.9400475118779695, "grad_norm": 0.8980913758277893, "learning_rate": 2e-05, "loss": 0.6063, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15037, "tokens_per_second_per_gpu": 10512.94, "total_tokens": 1484557858 }, { "epoch": 0.9401100275068767, "grad_norm": 0.8868497014045715, "learning_rate": 2e-05, "loss": 0.5992, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15038, "tokens_per_second_per_gpu": 11663.39, "total_tokens": 1484657380 }, { "epoch": 0.9401725431357839, "grad_norm": 0.8975770473480225, "learning_rate": 2e-05, "loss": 0.6123, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15039, "tokens_per_second_per_gpu": 10544.55, "total_tokens": 1484755223 }, { "epoch": 0.9402350587646912, "grad_norm": 0.8936448693275452, "learning_rate": 2e-05, "loss": 0.6222, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15040, "tokens_per_second_per_gpu": 10038.75, "total_tokens": 1484856377 }, { "epoch": 0.9402975743935984, "grad_norm": 0.9614792466163635, "learning_rate": 2e-05, "loss": 0.568, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15041, "tokens_per_second_per_gpu": 10011.78, "total_tokens": 1484947811 }, { "epoch": 0.9403600900225056, "grad_norm": 0.9584652781486511, "learning_rate": 2e-05, "loss": 0.5732, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15042, "tokens_per_second_per_gpu": 10011.47, "total_tokens": 1485042064 }, { "epoch": 0.9404226056514129, "grad_norm": 0.9070960879325867, "learning_rate": 2e-05, "loss": 0.6316, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15043, "tokens_per_second_per_gpu": 11488.25, "total_tokens": 1485145423 }, { "epoch": 0.9404851212803201, "grad_norm": 0.9015987515449524, "learning_rate": 2e-05, "loss": 0.5852, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15044, "tokens_per_second_per_gpu": 10101.83, "total_tokens": 1485242341 }, { "epoch": 0.9405476369092273, "grad_norm": 0.8897616863250732, "learning_rate": 2e-05, "loss": 0.6327, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15045, "tokens_per_second_per_gpu": 11066.83, "total_tokens": 1485344030 }, { "epoch": 0.9406101525381345, "grad_norm": 0.8734344244003296, "learning_rate": 2e-05, "loss": 0.5529, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15046, "tokens_per_second_per_gpu": 9899.7, "total_tokens": 1485437744 }, { "epoch": 0.9406726681670418, "grad_norm": 0.8806923031806946, "learning_rate": 2e-05, "loss": 0.6224, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15047, "tokens_per_second_per_gpu": 9713.72, "total_tokens": 1485536603 }, { "epoch": 0.940735183795949, "grad_norm": 0.8847298622131348, "learning_rate": 2e-05, "loss": 0.62, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15048, "tokens_per_second_per_gpu": 9921.23, "total_tokens": 1485634626 }, { "epoch": 0.9407976994248562, "grad_norm": 0.8992968201637268, "learning_rate": 2e-05, "loss": 0.6498, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15049, "tokens_per_second_per_gpu": 9833.28, "total_tokens": 1485732754 }, { "epoch": 0.9408602150537635, "grad_norm": 0.8595081567764282, "learning_rate": 2e-05, "loss": 0.5878, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15050, "tokens_per_second_per_gpu": 10930.06, "total_tokens": 1485835015 }, { "epoch": 0.9409227306826706, "grad_norm": 0.8994283676147461, "learning_rate": 2e-05, "loss": 0.5627, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15051, "tokens_per_second_per_gpu": 10117.61, "total_tokens": 1485928105 }, { "epoch": 0.9409852463115779, "grad_norm": 0.9226435422897339, "learning_rate": 2e-05, "loss": 0.6416, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15052, "tokens_per_second_per_gpu": 10471.8, "total_tokens": 1486026404 }, { "epoch": 0.9410477619404851, "grad_norm": 0.8672729134559631, "learning_rate": 2e-05, "loss": 0.5968, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15053, "tokens_per_second_per_gpu": 10745.38, "total_tokens": 1486124938 }, { "epoch": 0.9411102775693924, "grad_norm": 0.9087327718734741, "learning_rate": 2e-05, "loss": 0.6229, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15054, "tokens_per_second_per_gpu": 10406.57, "total_tokens": 1486224874 }, { "epoch": 0.9411727931982996, "grad_norm": 0.8767484426498413, "learning_rate": 2e-05, "loss": 0.6161, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15055, "tokens_per_second_per_gpu": 10858.5, "total_tokens": 1486323495 }, { "epoch": 0.9412353088272069, "grad_norm": 0.8909875154495239, "learning_rate": 2e-05, "loss": 0.5987, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15056, "tokens_per_second_per_gpu": 10035.88, "total_tokens": 1486421502 }, { "epoch": 0.941297824456114, "grad_norm": 0.8996868133544922, "learning_rate": 2e-05, "loss": 0.5905, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15057, "tokens_per_second_per_gpu": 9758.84, "total_tokens": 1486519776 }, { "epoch": 0.9413603400850212, "grad_norm": 0.8716163039207458, "learning_rate": 2e-05, "loss": 0.6108, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15058, "tokens_per_second_per_gpu": 11546.09, "total_tokens": 1486620115 }, { "epoch": 0.9414228557139285, "grad_norm": 0.8943503499031067, "learning_rate": 2e-05, "loss": 0.6149, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15059, "tokens_per_second_per_gpu": 10371.38, "total_tokens": 1486719380 }, { "epoch": 0.9414853713428357, "grad_norm": 0.9058894515037537, "learning_rate": 2e-05, "loss": 0.5828, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15060, "tokens_per_second_per_gpu": 10714.54, "total_tokens": 1486813690 }, { "epoch": 0.941547886971743, "grad_norm": 0.8855950832366943, "learning_rate": 2e-05, "loss": 0.6429, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15061, "tokens_per_second_per_gpu": 10141.44, "total_tokens": 1486912862 }, { "epoch": 0.9416104026006502, "grad_norm": 0.9431714415550232, "learning_rate": 2e-05, "loss": 0.6069, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15062, "tokens_per_second_per_gpu": 9755.01, "total_tokens": 1487008852 }, { "epoch": 0.9416729182295573, "grad_norm": 0.8978708386421204, "learning_rate": 2e-05, "loss": 0.6087, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15063, "tokens_per_second_per_gpu": 10678.31, "total_tokens": 1487107732 }, { "epoch": 0.9417354338584646, "grad_norm": 0.8807766437530518, "learning_rate": 2e-05, "loss": 0.6459, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15064, "tokens_per_second_per_gpu": 10876.53, "total_tokens": 1487207665 }, { "epoch": 0.9417979494873718, "grad_norm": 0.858519434928894, "learning_rate": 2e-05, "loss": 0.5932, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15065, "tokens_per_second_per_gpu": 11573.85, "total_tokens": 1487309725 }, { "epoch": 0.9418604651162791, "grad_norm": 0.9005534648895264, "learning_rate": 2e-05, "loss": 0.6162, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15066, "tokens_per_second_per_gpu": 10442.09, "total_tokens": 1487407566 }, { "epoch": 0.9419229807451863, "grad_norm": 0.9355185031890869, "learning_rate": 2e-05, "loss": 0.5687, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15067, "tokens_per_second_per_gpu": 10236.78, "total_tokens": 1487502310 }, { "epoch": 0.9419854963740936, "grad_norm": 0.8899062275886536, "learning_rate": 2e-05, "loss": 0.6275, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15068, "tokens_per_second_per_gpu": 10716.09, "total_tokens": 1487605439 }, { "epoch": 0.9420480120030007, "grad_norm": 0.908021867275238, "learning_rate": 2e-05, "loss": 0.6176, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15069, "tokens_per_second_per_gpu": 9917.27, "total_tokens": 1487704140 }, { "epoch": 0.942110527631908, "grad_norm": 0.9053241014480591, "learning_rate": 2e-05, "loss": 0.6059, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15070, "tokens_per_second_per_gpu": 10556.47, "total_tokens": 1487801004 }, { "epoch": 0.9421730432608152, "grad_norm": 0.8582740426063538, "learning_rate": 2e-05, "loss": 0.6047, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15071, "tokens_per_second_per_gpu": 11104.91, "total_tokens": 1487902269 }, { "epoch": 0.9422355588897224, "grad_norm": 0.9276375770568848, "learning_rate": 2e-05, "loss": 0.636, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15072, "tokens_per_second_per_gpu": 10062.1, "total_tokens": 1488001242 }, { "epoch": 0.9422980745186297, "grad_norm": 0.9261825084686279, "learning_rate": 2e-05, "loss": 0.6341, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15073, "tokens_per_second_per_gpu": 10684.01, "total_tokens": 1488101981 }, { "epoch": 0.9423605901475369, "grad_norm": 0.9031714200973511, "learning_rate": 2e-05, "loss": 0.6067, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15074, "tokens_per_second_per_gpu": 9571.21, "total_tokens": 1488196089 }, { "epoch": 0.9424231057764441, "grad_norm": 0.8799786567687988, "learning_rate": 2e-05, "loss": 0.5986, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15075, "tokens_per_second_per_gpu": 10807.92, "total_tokens": 1488297107 }, { "epoch": 0.9424856214053513, "grad_norm": 0.9061189293861389, "learning_rate": 2e-05, "loss": 0.6191, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15076, "tokens_per_second_per_gpu": 10660.91, "total_tokens": 1488398283 }, { "epoch": 0.9425481370342585, "grad_norm": 0.8969331383705139, "learning_rate": 2e-05, "loss": 0.5979, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15077, "tokens_per_second_per_gpu": 10885.24, "total_tokens": 1488498472 }, { "epoch": 0.9426106526631658, "grad_norm": 0.9064275026321411, "learning_rate": 2e-05, "loss": 0.5885, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15078, "tokens_per_second_per_gpu": 10783.99, "total_tokens": 1488594845 }, { "epoch": 0.942673168292073, "grad_norm": 0.9039790034294128, "learning_rate": 2e-05, "loss": 0.6631, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15079, "tokens_per_second_per_gpu": 10576.06, "total_tokens": 1488696530 }, { "epoch": 0.9427356839209803, "grad_norm": 0.8409112691879272, "learning_rate": 2e-05, "loss": 0.6254, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15080, "tokens_per_second_per_gpu": 11929.05, "total_tokens": 1488801336 }, { "epoch": 0.9427981995498874, "grad_norm": 0.9261804819107056, "learning_rate": 2e-05, "loss": 0.6363, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15081, "tokens_per_second_per_gpu": 10285.27, "total_tokens": 1488901542 }, { "epoch": 0.9428607151787947, "grad_norm": 0.9012933373451233, "learning_rate": 2e-05, "loss": 0.6061, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15082, "tokens_per_second_per_gpu": 10362.22, "total_tokens": 1488999707 }, { "epoch": 0.9429232308077019, "grad_norm": 0.8574255704879761, "learning_rate": 2e-05, "loss": 0.5836, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15083, "tokens_per_second_per_gpu": 10010.29, "total_tokens": 1489099306 }, { "epoch": 0.9429857464366092, "grad_norm": 0.9101992249488831, "learning_rate": 2e-05, "loss": 0.5946, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15084, "tokens_per_second_per_gpu": 10255.4, "total_tokens": 1489192680 }, { "epoch": 0.9430482620655164, "grad_norm": 0.8813315629959106, "learning_rate": 2e-05, "loss": 0.6058, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15085, "tokens_per_second_per_gpu": 11182.44, "total_tokens": 1489295026 }, { "epoch": 0.9431107776944236, "grad_norm": 0.8706498742103577, "learning_rate": 2e-05, "loss": 0.6169, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15086, "tokens_per_second_per_gpu": 11052.37, "total_tokens": 1489398477 }, { "epoch": 0.9431732933233309, "grad_norm": 0.8967713713645935, "learning_rate": 2e-05, "loss": 0.5981, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15087, "tokens_per_second_per_gpu": 10127.08, "total_tokens": 1489495842 }, { "epoch": 0.943235808952238, "grad_norm": 0.8535625338554382, "learning_rate": 2e-05, "loss": 0.6445, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15088, "tokens_per_second_per_gpu": 10944.43, "total_tokens": 1489599203 }, { "epoch": 0.9432983245811453, "grad_norm": 0.8732678294181824, "learning_rate": 2e-05, "loss": 0.615, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15089, "tokens_per_second_per_gpu": 10365.78, "total_tokens": 1489698183 }, { "epoch": 0.9433608402100525, "grad_norm": 0.8774877190589905, "learning_rate": 2e-05, "loss": 0.6024, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15090, "tokens_per_second_per_gpu": 10566.77, "total_tokens": 1489797427 }, { "epoch": 0.9434233558389598, "grad_norm": 0.8796140551567078, "learning_rate": 2e-05, "loss": 0.6389, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15091, "tokens_per_second_per_gpu": 9671.89, "total_tokens": 1489892389 }, { "epoch": 0.943485871467867, "grad_norm": 0.883033812046051, "learning_rate": 2e-05, "loss": 0.6075, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15092, "tokens_per_second_per_gpu": 10186.19, "total_tokens": 1489988874 }, { "epoch": 0.9435483870967742, "grad_norm": 0.8614593744277954, "learning_rate": 2e-05, "loss": 0.6143, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15093, "tokens_per_second_per_gpu": 10428.94, "total_tokens": 1490088931 }, { "epoch": 0.9436109027256814, "grad_norm": 0.8925182819366455, "learning_rate": 2e-05, "loss": 0.6135, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15094, "tokens_per_second_per_gpu": 10079.53, "total_tokens": 1490185341 }, { "epoch": 0.9436734183545886, "grad_norm": 0.921409547328949, "learning_rate": 2e-05, "loss": 0.5985, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15095, "tokens_per_second_per_gpu": 10546.23, "total_tokens": 1490284355 }, { "epoch": 0.9437359339834959, "grad_norm": 0.8923031091690063, "learning_rate": 2e-05, "loss": 0.6141, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15096, "tokens_per_second_per_gpu": 10708.95, "total_tokens": 1490386668 }, { "epoch": 0.9437984496124031, "grad_norm": 0.8850388526916504, "learning_rate": 2e-05, "loss": 0.5897, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15097, "tokens_per_second_per_gpu": 10257.4, "total_tokens": 1490481366 }, { "epoch": 0.9438609652413104, "grad_norm": 0.9133955836296082, "learning_rate": 2e-05, "loss": 0.5944, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15098, "tokens_per_second_per_gpu": 10705.74, "total_tokens": 1490583043 }, { "epoch": 0.9439234808702176, "grad_norm": 0.9293316602706909, "learning_rate": 2e-05, "loss": 0.6393, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15099, "tokens_per_second_per_gpu": 9671.25, "total_tokens": 1490678163 }, { "epoch": 0.9439859964991247, "grad_norm": 0.9128918051719666, "learning_rate": 2e-05, "loss": 0.6204, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15100, "tokens_per_second_per_gpu": 10307.09, "total_tokens": 1490774524 }, { "epoch": 0.944048512128032, "grad_norm": 0.9102985262870789, "learning_rate": 2e-05, "loss": 0.6067, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15101, "tokens_per_second_per_gpu": 10364.21, "total_tokens": 1490872333 }, { "epoch": 0.9441110277569392, "grad_norm": 0.8614866733551025, "learning_rate": 2e-05, "loss": 0.5858, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15102, "tokens_per_second_per_gpu": 9682.94, "total_tokens": 1490969424 }, { "epoch": 0.9441735433858465, "grad_norm": 0.8887973427772522, "learning_rate": 2e-05, "loss": 0.6115, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15103, "tokens_per_second_per_gpu": 10472.02, "total_tokens": 1491067304 }, { "epoch": 0.9442360590147537, "grad_norm": 0.9002540111541748, "learning_rate": 2e-05, "loss": 0.6073, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15104, "tokens_per_second_per_gpu": 10247.18, "total_tokens": 1491163126 }, { "epoch": 0.944298574643661, "grad_norm": 0.8580884337425232, "learning_rate": 2e-05, "loss": 0.63, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15105, "tokens_per_second_per_gpu": 11219.01, "total_tokens": 1491266628 }, { "epoch": 0.9443610902725681, "grad_norm": 0.9316367506980896, "learning_rate": 2e-05, "loss": 0.6325, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15106, "tokens_per_second_per_gpu": 11092.23, "total_tokens": 1491368270 }, { "epoch": 0.9444236059014753, "grad_norm": 0.8785289525985718, "learning_rate": 2e-05, "loss": 0.5971, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15107, "tokens_per_second_per_gpu": 10842.59, "total_tokens": 1491467547 }, { "epoch": 0.9444861215303826, "grad_norm": 0.8992339372634888, "learning_rate": 2e-05, "loss": 0.6178, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15108, "tokens_per_second_per_gpu": 10501.84, "total_tokens": 1491566147 }, { "epoch": 0.9445486371592898, "grad_norm": 0.9057101011276245, "learning_rate": 2e-05, "loss": 0.5514, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15109, "tokens_per_second_per_gpu": 10182.48, "total_tokens": 1491659440 }, { "epoch": 0.9446111527881971, "grad_norm": 0.8825489282608032, "learning_rate": 2e-05, "loss": 0.5294, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15110, "tokens_per_second_per_gpu": 9003.08, "total_tokens": 1491751481 }, { "epoch": 0.9446736684171043, "grad_norm": 0.8770412802696228, "learning_rate": 2e-05, "loss": 0.6051, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15111, "tokens_per_second_per_gpu": 11238.25, "total_tokens": 1491854777 }, { "epoch": 0.9447361840460115, "grad_norm": 0.9361005425453186, "learning_rate": 2e-05, "loss": 0.6291, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15112, "tokens_per_second_per_gpu": 10432.82, "total_tokens": 1491955015 }, { "epoch": 0.9447986996749187, "grad_norm": 0.9105454683303833, "learning_rate": 2e-05, "loss": 0.6658, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15113, "tokens_per_second_per_gpu": 11145.96, "total_tokens": 1492057233 }, { "epoch": 0.9448612153038259, "grad_norm": 0.9015847444534302, "learning_rate": 2e-05, "loss": 0.6005, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15114, "tokens_per_second_per_gpu": 10742.57, "total_tokens": 1492154634 }, { "epoch": 0.9449237309327332, "grad_norm": 0.912396252155304, "learning_rate": 2e-05, "loss": 0.5894, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15115, "tokens_per_second_per_gpu": 10644.62, "total_tokens": 1492249563 }, { "epoch": 0.9449862465616404, "grad_norm": 0.8978230357170105, "learning_rate": 2e-05, "loss": 0.6171, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15116, "tokens_per_second_per_gpu": 10702.93, "total_tokens": 1492350357 }, { "epoch": 0.9450487621905477, "grad_norm": 0.8896271586418152, "learning_rate": 2e-05, "loss": 0.597, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15117, "tokens_per_second_per_gpu": 10506.39, "total_tokens": 1492448682 }, { "epoch": 0.9451112778194548, "grad_norm": 0.8936354517936707, "learning_rate": 2e-05, "loss": 0.6171, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15118, "tokens_per_second_per_gpu": 10957.87, "total_tokens": 1492547223 }, { "epoch": 0.945173793448362, "grad_norm": 0.8770129680633545, "learning_rate": 2e-05, "loss": 0.6085, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15119, "tokens_per_second_per_gpu": 10592.14, "total_tokens": 1492647059 }, { "epoch": 0.9452363090772693, "grad_norm": 0.919316828250885, "learning_rate": 2e-05, "loss": 0.6115, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15120, "tokens_per_second_per_gpu": 10259.23, "total_tokens": 1492741531 }, { "epoch": 0.9452988247061765, "grad_norm": 0.8368552923202515, "learning_rate": 2e-05, "loss": 0.5742, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15121, "tokens_per_second_per_gpu": 10601.18, "total_tokens": 1492840844 }, { "epoch": 0.9453613403350838, "grad_norm": 0.8518938422203064, "learning_rate": 2e-05, "loss": 0.5893, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15122, "tokens_per_second_per_gpu": 10485.13, "total_tokens": 1492943183 }, { "epoch": 0.945423855963991, "grad_norm": 0.8765920996665955, "learning_rate": 2e-05, "loss": 0.6123, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15123, "tokens_per_second_per_gpu": 9808.15, "total_tokens": 1493040004 }, { "epoch": 0.9454863715928983, "grad_norm": 0.9262417554855347, "learning_rate": 2e-05, "loss": 0.6327, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15124, "tokens_per_second_per_gpu": 10768.99, "total_tokens": 1493140796 }, { "epoch": 0.9455488872218054, "grad_norm": 0.8846575021743774, "learning_rate": 2e-05, "loss": 0.5911, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15125, "tokens_per_second_per_gpu": 9573.61, "total_tokens": 1493233329 }, { "epoch": 0.9456114028507127, "grad_norm": 0.9537791609764099, "learning_rate": 2e-05, "loss": 0.6322, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15126, "tokens_per_second_per_gpu": 10901.44, "total_tokens": 1493336073 }, { "epoch": 0.9456739184796199, "grad_norm": 0.8680532574653625, "learning_rate": 2e-05, "loss": 0.5993, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15127, "tokens_per_second_per_gpu": 11262.63, "total_tokens": 1493438553 }, { "epoch": 0.9457364341085271, "grad_norm": 0.8636674880981445, "learning_rate": 2e-05, "loss": 0.6054, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15128, "tokens_per_second_per_gpu": 10259.92, "total_tokens": 1493535386 }, { "epoch": 0.9457989497374344, "grad_norm": 0.9228770732879639, "learning_rate": 2e-05, "loss": 0.6121, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15129, "tokens_per_second_per_gpu": 10393.85, "total_tokens": 1493628271 }, { "epoch": 0.9458614653663416, "grad_norm": 0.9073530435562134, "learning_rate": 2e-05, "loss": 0.6225, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15130, "tokens_per_second_per_gpu": 10626.12, "total_tokens": 1493731080 }, { "epoch": 0.9459239809952488, "grad_norm": 0.8668636083602905, "learning_rate": 2e-05, "loss": 0.6198, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15131, "tokens_per_second_per_gpu": 11024.58, "total_tokens": 1493831459 }, { "epoch": 0.945986496624156, "grad_norm": 0.8669145107269287, "learning_rate": 2e-05, "loss": 0.6505, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15132, "tokens_per_second_per_gpu": 10881.79, "total_tokens": 1493935245 }, { "epoch": 0.9460490122530633, "grad_norm": 0.8710184097290039, "learning_rate": 2e-05, "loss": 0.6459, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15133, "tokens_per_second_per_gpu": 10742.56, "total_tokens": 1494036695 }, { "epoch": 0.9461115278819705, "grad_norm": 0.8467453122138977, "learning_rate": 2e-05, "loss": 0.5873, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15134, "tokens_per_second_per_gpu": 10137.5, "total_tokens": 1494139481 }, { "epoch": 0.9461740435108777, "grad_norm": 0.8563389778137207, "learning_rate": 2e-05, "loss": 0.6139, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15135, "tokens_per_second_per_gpu": 10871.87, "total_tokens": 1494239874 }, { "epoch": 0.946236559139785, "grad_norm": 0.8966431617736816, "learning_rate": 2e-05, "loss": 0.5714, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15136, "tokens_per_second_per_gpu": 9791.79, "total_tokens": 1494333694 }, { "epoch": 0.9462990747686921, "grad_norm": 0.8725934028625488, "learning_rate": 2e-05, "loss": 0.5962, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15137, "tokens_per_second_per_gpu": 11100.94, "total_tokens": 1494432396 }, { "epoch": 0.9463615903975994, "grad_norm": 0.8749847412109375, "learning_rate": 2e-05, "loss": 0.602, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15138, "tokens_per_second_per_gpu": 11430.29, "total_tokens": 1494532740 }, { "epoch": 0.9464241060265066, "grad_norm": 0.8970205187797546, "learning_rate": 2e-05, "loss": 0.6173, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15139, "tokens_per_second_per_gpu": 10307.8, "total_tokens": 1494628366 }, { "epoch": 0.9464866216554139, "grad_norm": 0.8792722225189209, "learning_rate": 2e-05, "loss": 0.6048, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15140, "tokens_per_second_per_gpu": 10764.04, "total_tokens": 1494726479 }, { "epoch": 0.9465491372843211, "grad_norm": 0.9189568161964417, "learning_rate": 2e-05, "loss": 0.5862, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15141, "tokens_per_second_per_gpu": 10402.74, "total_tokens": 1494825239 }, { "epoch": 0.9466116529132284, "grad_norm": 0.8836260437965393, "learning_rate": 2e-05, "loss": 0.6465, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15142, "tokens_per_second_per_gpu": 10325.07, "total_tokens": 1494922630 }, { "epoch": 0.9466741685421355, "grad_norm": 0.9165393710136414, "learning_rate": 2e-05, "loss": 0.6332, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15143, "tokens_per_second_per_gpu": 10573.16, "total_tokens": 1495021061 }, { "epoch": 0.9467366841710427, "grad_norm": 0.8786934018135071, "learning_rate": 2e-05, "loss": 0.6445, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15144, "tokens_per_second_per_gpu": 10956.76, "total_tokens": 1495124273 }, { "epoch": 0.94679919979995, "grad_norm": 0.8800522089004517, "learning_rate": 2e-05, "loss": 0.6024, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15145, "tokens_per_second_per_gpu": 10583.65, "total_tokens": 1495223064 }, { "epoch": 0.9468617154288572, "grad_norm": 0.848336935043335, "learning_rate": 2e-05, "loss": 0.6, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15146, "tokens_per_second_per_gpu": 10969.84, "total_tokens": 1495326023 }, { "epoch": 0.9469242310577645, "grad_norm": 0.8786110281944275, "learning_rate": 2e-05, "loss": 0.651, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15147, "tokens_per_second_per_gpu": 11385.34, "total_tokens": 1495430928 }, { "epoch": 0.9469867466866717, "grad_norm": 0.8891037702560425, "learning_rate": 2e-05, "loss": 0.6118, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15148, "tokens_per_second_per_gpu": 10797.12, "total_tokens": 1495532380 }, { "epoch": 0.9470492623155788, "grad_norm": 0.8660102486610413, "learning_rate": 2e-05, "loss": 0.6295, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15149, "tokens_per_second_per_gpu": 10899.14, "total_tokens": 1495635072 }, { "epoch": 0.9471117779444861, "grad_norm": 0.8802201747894287, "learning_rate": 2e-05, "loss": 0.6033, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15150, "tokens_per_second_per_gpu": 10807.66, "total_tokens": 1495732287 }, { "epoch": 0.9471742935733933, "grad_norm": 0.9290359020233154, "learning_rate": 2e-05, "loss": 0.6481, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15151, "tokens_per_second_per_gpu": 11244.96, "total_tokens": 1495836879 }, { "epoch": 0.9472368092023006, "grad_norm": 0.9429228901863098, "learning_rate": 2e-05, "loss": 0.6087, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15152, "tokens_per_second_per_gpu": 10618.77, "total_tokens": 1495931872 }, { "epoch": 0.9472993248312078, "grad_norm": 0.8761871457099915, "learning_rate": 2e-05, "loss": 0.585, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15153, "tokens_per_second_per_gpu": 10199.7, "total_tokens": 1496029571 }, { "epoch": 0.9473618404601151, "grad_norm": 0.894080638885498, "learning_rate": 2e-05, "loss": 0.6121, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15154, "tokens_per_second_per_gpu": 10109.51, "total_tokens": 1496129423 }, { "epoch": 0.9474243560890222, "grad_norm": 0.8711718320846558, "learning_rate": 2e-05, "loss": 0.6207, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15155, "tokens_per_second_per_gpu": 10946.38, "total_tokens": 1496232737 }, { "epoch": 0.9474868717179294, "grad_norm": 0.911320686340332, "learning_rate": 2e-05, "loss": 0.6262, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15156, "tokens_per_second_per_gpu": 10299.7, "total_tokens": 1496329323 }, { "epoch": 0.9475493873468367, "grad_norm": 0.8634622693061829, "learning_rate": 2e-05, "loss": 0.5764, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15157, "tokens_per_second_per_gpu": 10819.63, "total_tokens": 1496428660 }, { "epoch": 0.9476119029757439, "grad_norm": 0.8599872589111328, "learning_rate": 2e-05, "loss": 0.5857, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15158, "tokens_per_second_per_gpu": 10448.79, "total_tokens": 1496526689 }, { "epoch": 0.9476744186046512, "grad_norm": 0.9207242131233215, "learning_rate": 2e-05, "loss": 0.6229, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15159, "tokens_per_second_per_gpu": 10198.54, "total_tokens": 1496624847 }, { "epoch": 0.9477369342335584, "grad_norm": 0.8713161945343018, "learning_rate": 2e-05, "loss": 0.6118, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15160, "tokens_per_second_per_gpu": 11051.33, "total_tokens": 1496727541 }, { "epoch": 0.9477994498624657, "grad_norm": 0.9040536284446716, "learning_rate": 2e-05, "loss": 0.6583, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15161, "tokens_per_second_per_gpu": 10803.81, "total_tokens": 1496828873 }, { "epoch": 0.9478619654913728, "grad_norm": 0.8500233888626099, "learning_rate": 2e-05, "loss": 0.5914, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15162, "tokens_per_second_per_gpu": 11205.74, "total_tokens": 1496933175 }, { "epoch": 0.94792448112028, "grad_norm": 0.8694100975990295, "learning_rate": 2e-05, "loss": 0.6102, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15163, "tokens_per_second_per_gpu": 10812.94, "total_tokens": 1497034344 }, { "epoch": 0.9479869967491873, "grad_norm": 0.9266387820243835, "learning_rate": 2e-05, "loss": 0.6648, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15164, "tokens_per_second_per_gpu": 10149.25, "total_tokens": 1497133043 }, { "epoch": 0.9480495123780945, "grad_norm": 0.88747638463974, "learning_rate": 2e-05, "loss": 0.5894, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15165, "tokens_per_second_per_gpu": 10581.36, "total_tokens": 1497233003 }, { "epoch": 0.9481120280070018, "grad_norm": 0.8981761336326599, "learning_rate": 2e-05, "loss": 0.5944, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15166, "tokens_per_second_per_gpu": 11405.93, "total_tokens": 1497333133 }, { "epoch": 0.948174543635909, "grad_norm": 0.898940920829773, "learning_rate": 2e-05, "loss": 0.6123, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15167, "tokens_per_second_per_gpu": 10512.77, "total_tokens": 1497434377 }, { "epoch": 0.9482370592648162, "grad_norm": 0.8890893459320068, "learning_rate": 2e-05, "loss": 0.6047, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15168, "tokens_per_second_per_gpu": 10477.0, "total_tokens": 1497533031 }, { "epoch": 0.9482995748937234, "grad_norm": 0.8475194573402405, "learning_rate": 2e-05, "loss": 0.5934, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15169, "tokens_per_second_per_gpu": 11002.87, "total_tokens": 1497635744 }, { "epoch": 0.9483620905226307, "grad_norm": 0.8904467821121216, "learning_rate": 2e-05, "loss": 0.5895, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15170, "tokens_per_second_per_gpu": 11050.7, "total_tokens": 1497736176 }, { "epoch": 0.9484246061515379, "grad_norm": 0.9326315522193909, "learning_rate": 2e-05, "loss": 0.6406, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15171, "tokens_per_second_per_gpu": 10908.75, "total_tokens": 1497836751 }, { "epoch": 0.9484871217804451, "grad_norm": 0.917857825756073, "learning_rate": 2e-05, "loss": 0.6456, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15172, "tokens_per_second_per_gpu": 10604.76, "total_tokens": 1497939746 }, { "epoch": 0.9485496374093524, "grad_norm": 0.9168757796287537, "learning_rate": 2e-05, "loss": 0.644, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15173, "tokens_per_second_per_gpu": 10939.38, "total_tokens": 1498041947 }, { "epoch": 0.9486121530382595, "grad_norm": 0.8840323090553284, "learning_rate": 2e-05, "loss": 0.6046, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15174, "tokens_per_second_per_gpu": 9878.89, "total_tokens": 1498141017 }, { "epoch": 0.9486746686671668, "grad_norm": 0.8850911259651184, "learning_rate": 2e-05, "loss": 0.5766, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15175, "tokens_per_second_per_gpu": 10285.23, "total_tokens": 1498238752 }, { "epoch": 0.948737184296074, "grad_norm": 0.8954099416732788, "learning_rate": 2e-05, "loss": 0.6292, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15176, "tokens_per_second_per_gpu": 10001.94, "total_tokens": 1498334384 }, { "epoch": 0.9487996999249813, "grad_norm": 0.9044909477233887, "learning_rate": 2e-05, "loss": 0.6021, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15177, "tokens_per_second_per_gpu": 10487.05, "total_tokens": 1498432443 }, { "epoch": 0.9488622155538885, "grad_norm": 0.8865482807159424, "learning_rate": 2e-05, "loss": 0.6061, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15178, "tokens_per_second_per_gpu": 10716.15, "total_tokens": 1498531739 }, { "epoch": 0.9489247311827957, "grad_norm": 0.8873329162597656, "learning_rate": 2e-05, "loss": 0.6028, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15179, "tokens_per_second_per_gpu": 10261.93, "total_tokens": 1498630402 }, { "epoch": 0.9489872468117029, "grad_norm": 0.9160929322242737, "learning_rate": 2e-05, "loss": 0.5905, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15180, "tokens_per_second_per_gpu": 10397.46, "total_tokens": 1498728655 }, { "epoch": 0.9490497624406101, "grad_norm": 0.8608705401420593, "learning_rate": 2e-05, "loss": 0.6064, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15181, "tokens_per_second_per_gpu": 10471.92, "total_tokens": 1498829349 }, { "epoch": 0.9491122780695174, "grad_norm": 0.8728352785110474, "learning_rate": 2e-05, "loss": 0.5817, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15182, "tokens_per_second_per_gpu": 10426.99, "total_tokens": 1498927259 }, { "epoch": 0.9491747936984246, "grad_norm": 0.9096529483795166, "learning_rate": 2e-05, "loss": 0.6136, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15183, "tokens_per_second_per_gpu": 10488.59, "total_tokens": 1499026481 }, { "epoch": 0.9492373093273319, "grad_norm": 0.8942201733589172, "learning_rate": 2e-05, "loss": 0.645, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15184, "tokens_per_second_per_gpu": 10591.25, "total_tokens": 1499126625 }, { "epoch": 0.9492998249562391, "grad_norm": 0.9087256789207458, "learning_rate": 2e-05, "loss": 0.5746, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15185, "tokens_per_second_per_gpu": 10131.03, "total_tokens": 1499226119 }, { "epoch": 0.9493623405851462, "grad_norm": 0.9263210296630859, "learning_rate": 2e-05, "loss": 0.6418, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15186, "tokens_per_second_per_gpu": 10285.63, "total_tokens": 1499321204 }, { "epoch": 0.9494248562140535, "grad_norm": 0.8865593075752258, "learning_rate": 2e-05, "loss": 0.6407, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15187, "tokens_per_second_per_gpu": 10168.75, "total_tokens": 1499419706 }, { "epoch": 0.9494873718429607, "grad_norm": 0.896479606628418, "learning_rate": 2e-05, "loss": 0.5978, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15188, "tokens_per_second_per_gpu": 10769.7, "total_tokens": 1499517359 }, { "epoch": 0.949549887471868, "grad_norm": 0.8913639187812805, "learning_rate": 2e-05, "loss": 0.581, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15189, "tokens_per_second_per_gpu": 10023.09, "total_tokens": 1499614823 }, { "epoch": 0.9496124031007752, "grad_norm": 0.907427966594696, "learning_rate": 2e-05, "loss": 0.5847, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15190, "tokens_per_second_per_gpu": 9932.96, "total_tokens": 1499711174 }, { "epoch": 0.9496749187296825, "grad_norm": 0.8599076271057129, "learning_rate": 2e-05, "loss": 0.6054, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15191, "tokens_per_second_per_gpu": 10727.47, "total_tokens": 1499809047 }, { "epoch": 0.9497374343585896, "grad_norm": 0.8800089955329895, "learning_rate": 2e-05, "loss": 0.5733, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15192, "tokens_per_second_per_gpu": 10170.17, "total_tokens": 1499907066 }, { "epoch": 0.9497999499874968, "grad_norm": 0.8862232565879822, "learning_rate": 2e-05, "loss": 0.6063, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15193, "tokens_per_second_per_gpu": 10769.82, "total_tokens": 1500008818 }, { "epoch": 0.9498624656164041, "grad_norm": 0.9118852019309998, "learning_rate": 2e-05, "loss": 0.6739, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15194, "tokens_per_second_per_gpu": 10326.92, "total_tokens": 1500111155 }, { "epoch": 0.9499249812453113, "grad_norm": 0.9210751056671143, "learning_rate": 2e-05, "loss": 0.6134, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15195, "tokens_per_second_per_gpu": 10402.59, "total_tokens": 1500208981 }, { "epoch": 0.9499874968742186, "grad_norm": 0.837070643901825, "learning_rate": 2e-05, "loss": 0.567, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15196, "tokens_per_second_per_gpu": 10652.26, "total_tokens": 1500309834 }, { "epoch": 0.9500500125031258, "grad_norm": 0.8715471029281616, "learning_rate": 2e-05, "loss": 0.6276, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15197, "tokens_per_second_per_gpu": 10945.27, "total_tokens": 1500413466 }, { "epoch": 0.9501125281320331, "grad_norm": 0.9262584447860718, "learning_rate": 2e-05, "loss": 0.6412, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15198, "tokens_per_second_per_gpu": 9140.52, "total_tokens": 1500507585 }, { "epoch": 0.9501750437609402, "grad_norm": 0.862174391746521, "learning_rate": 2e-05, "loss": 0.6261, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15199, "tokens_per_second_per_gpu": 11653.26, "total_tokens": 1500614796 }, { "epoch": 0.9502375593898474, "grad_norm": 0.90572190284729, "learning_rate": 2e-05, "loss": 0.6039, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15200, "tokens_per_second_per_gpu": 10350.37, "total_tokens": 1500711910 }, { "epoch": 0.9503000750187547, "grad_norm": 0.9229933619499207, "learning_rate": 2e-05, "loss": 0.6163, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15201, "tokens_per_second_per_gpu": 11060.64, "total_tokens": 1500812543 }, { "epoch": 0.9503625906476619, "grad_norm": 0.9182895421981812, "learning_rate": 2e-05, "loss": 0.5836, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15202, "tokens_per_second_per_gpu": 10037.8, "total_tokens": 1500910802 }, { "epoch": 0.9504251062765692, "grad_norm": 0.8943873047828674, "learning_rate": 2e-05, "loss": 0.6088, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15203, "tokens_per_second_per_gpu": 10110.12, "total_tokens": 1501008869 }, { "epoch": 0.9504876219054764, "grad_norm": 0.8936648368835449, "learning_rate": 2e-05, "loss": 0.6023, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15204, "tokens_per_second_per_gpu": 10886.82, "total_tokens": 1501109235 }, { "epoch": 0.9505501375343836, "grad_norm": 0.8507580161094666, "learning_rate": 2e-05, "loss": 0.6108, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15205, "tokens_per_second_per_gpu": 10070.04, "total_tokens": 1501206148 }, { "epoch": 0.9506126531632908, "grad_norm": 0.9394242167472839, "learning_rate": 2e-05, "loss": 0.6481, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15206, "tokens_per_second_per_gpu": 10878.5, "total_tokens": 1501305512 }, { "epoch": 0.950675168792198, "grad_norm": 0.8537867069244385, "learning_rate": 2e-05, "loss": 0.6201, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15207, "tokens_per_second_per_gpu": 10425.68, "total_tokens": 1501408825 }, { "epoch": 0.9507376844211053, "grad_norm": 0.9353212714195251, "learning_rate": 2e-05, "loss": 0.6371, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15208, "tokens_per_second_per_gpu": 10756.33, "total_tokens": 1501509933 }, { "epoch": 0.9508002000500125, "grad_norm": 0.9230219125747681, "learning_rate": 2e-05, "loss": 0.6884, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15209, "tokens_per_second_per_gpu": 10819.35, "total_tokens": 1501610311 }, { "epoch": 0.9508627156789198, "grad_norm": 0.8783308267593384, "learning_rate": 2e-05, "loss": 0.6038, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15210, "tokens_per_second_per_gpu": 10271.83, "total_tokens": 1501706339 }, { "epoch": 0.9509252313078269, "grad_norm": 0.9175734519958496, "learning_rate": 2e-05, "loss": 0.5722, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15211, "tokens_per_second_per_gpu": 9141.05, "total_tokens": 1501798365 }, { "epoch": 0.9509877469367342, "grad_norm": 0.8974765539169312, "learning_rate": 2e-05, "loss": 0.6115, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15212, "tokens_per_second_per_gpu": 10880.85, "total_tokens": 1501897859 }, { "epoch": 0.9510502625656414, "grad_norm": 0.8912819027900696, "learning_rate": 2e-05, "loss": 0.6035, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15213, "tokens_per_second_per_gpu": 10276.56, "total_tokens": 1501994014 }, { "epoch": 0.9511127781945486, "grad_norm": 0.9044928550720215, "learning_rate": 2e-05, "loss": 0.5997, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15214, "tokens_per_second_per_gpu": 10459.28, "total_tokens": 1502093381 }, { "epoch": 0.9511752938234559, "grad_norm": 0.8855650424957275, "learning_rate": 2e-05, "loss": 0.6562, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15215, "tokens_per_second_per_gpu": 11040.9, "total_tokens": 1502194690 }, { "epoch": 0.9512378094523631, "grad_norm": 0.8624928593635559, "learning_rate": 2e-05, "loss": 0.5997, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15216, "tokens_per_second_per_gpu": 10666.99, "total_tokens": 1502296758 }, { "epoch": 0.9513003250812703, "grad_norm": 0.899940550327301, "learning_rate": 2e-05, "loss": 0.6185, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15217, "tokens_per_second_per_gpu": 9833.53, "total_tokens": 1502390820 }, { "epoch": 0.9513628407101775, "grad_norm": 0.894500195980072, "learning_rate": 2e-05, "loss": 0.6191, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15218, "tokens_per_second_per_gpu": 10120.64, "total_tokens": 1502489786 }, { "epoch": 0.9514253563390848, "grad_norm": 0.8874714970588684, "learning_rate": 2e-05, "loss": 0.6117, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15219, "tokens_per_second_per_gpu": 10873.53, "total_tokens": 1502588193 }, { "epoch": 0.951487871967992, "grad_norm": 0.8769205808639526, "learning_rate": 2e-05, "loss": 0.594, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15220, "tokens_per_second_per_gpu": 9986.71, "total_tokens": 1502686492 }, { "epoch": 0.9515503875968992, "grad_norm": 0.8705939650535583, "learning_rate": 2e-05, "loss": 0.5735, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15221, "tokens_per_second_per_gpu": 10161.36, "total_tokens": 1502784749 }, { "epoch": 0.9516129032258065, "grad_norm": 0.9052197337150574, "learning_rate": 2e-05, "loss": 0.634, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15222, "tokens_per_second_per_gpu": 10644.31, "total_tokens": 1502886124 }, { "epoch": 0.9516754188547136, "grad_norm": 0.9246733784675598, "learning_rate": 2e-05, "loss": 0.5953, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15223, "tokens_per_second_per_gpu": 10320.02, "total_tokens": 1502987010 }, { "epoch": 0.9517379344836209, "grad_norm": 0.9142485857009888, "learning_rate": 2e-05, "loss": 0.5522, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15224, "tokens_per_second_per_gpu": 10041.87, "total_tokens": 1503080646 }, { "epoch": 0.9518004501125281, "grad_norm": 0.8694064617156982, "learning_rate": 2e-05, "loss": 0.5862, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15225, "tokens_per_second_per_gpu": 11156.91, "total_tokens": 1503180691 }, { "epoch": 0.9518629657414354, "grad_norm": 0.8658676147460938, "learning_rate": 2e-05, "loss": 0.6105, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15226, "tokens_per_second_per_gpu": 11033.76, "total_tokens": 1503279769 }, { "epoch": 0.9519254813703426, "grad_norm": 0.9398990273475647, "learning_rate": 2e-05, "loss": 0.6556, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15227, "tokens_per_second_per_gpu": 10370.87, "total_tokens": 1503377971 }, { "epoch": 0.9519879969992499, "grad_norm": 0.8852346539497375, "learning_rate": 2e-05, "loss": 0.5764, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15228, "tokens_per_second_per_gpu": 12854.41, "total_tokens": 1503468870 }, { "epoch": 0.952050512628157, "grad_norm": 0.9123375415802002, "learning_rate": 2e-05, "loss": 0.605, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15229, "tokens_per_second_per_gpu": 11207.89, "total_tokens": 1503569148 }, { "epoch": 0.9521130282570642, "grad_norm": 0.9081071019172668, "learning_rate": 2e-05, "loss": 0.6055, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15230, "tokens_per_second_per_gpu": 10593.37, "total_tokens": 1503667978 }, { "epoch": 0.9521755438859715, "grad_norm": 0.9213401675224304, "learning_rate": 2e-05, "loss": 0.6093, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15231, "tokens_per_second_per_gpu": 10273.44, "total_tokens": 1503766379 }, { "epoch": 0.9522380595148787, "grad_norm": 0.9045662879943848, "learning_rate": 2e-05, "loss": 0.6177, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15232, "tokens_per_second_per_gpu": 10133.37, "total_tokens": 1503865409 }, { "epoch": 0.952300575143786, "grad_norm": 0.8653702139854431, "learning_rate": 2e-05, "loss": 0.661, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15233, "tokens_per_second_per_gpu": 10864.94, "total_tokens": 1503968908 }, { "epoch": 0.9523630907726932, "grad_norm": 0.9107160568237305, "learning_rate": 2e-05, "loss": 0.647, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15234, "tokens_per_second_per_gpu": 10065.97, "total_tokens": 1504067779 }, { "epoch": 0.9524256064016005, "grad_norm": 0.8893353939056396, "learning_rate": 2e-05, "loss": 0.6501, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15235, "tokens_per_second_per_gpu": 10601.41, "total_tokens": 1504167375 }, { "epoch": 0.9524881220305076, "grad_norm": 0.9456345438957214, "learning_rate": 2e-05, "loss": 0.6252, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15236, "tokens_per_second_per_gpu": 10879.21, "total_tokens": 1504266586 }, { "epoch": 0.9525506376594148, "grad_norm": 0.920330286026001, "learning_rate": 2e-05, "loss": 0.6301, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15237, "tokens_per_second_per_gpu": 10706.87, "total_tokens": 1504366622 }, { "epoch": 0.9526131532883221, "grad_norm": 0.8721843957901001, "learning_rate": 2e-05, "loss": 0.5726, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15238, "tokens_per_second_per_gpu": 9934.26, "total_tokens": 1504463059 }, { "epoch": 0.9526756689172293, "grad_norm": 0.8682039976119995, "learning_rate": 2e-05, "loss": 0.5863, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15239, "tokens_per_second_per_gpu": 10402.61, "total_tokens": 1504562833 }, { "epoch": 0.9527381845461366, "grad_norm": 0.8813186883926392, "learning_rate": 2e-05, "loss": 0.5845, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15240, "tokens_per_second_per_gpu": 10763.0, "total_tokens": 1504664768 }, { "epoch": 0.9528007001750438, "grad_norm": 0.9258637428283691, "learning_rate": 2e-05, "loss": 0.6259, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15241, "tokens_per_second_per_gpu": 9518.75, "total_tokens": 1504758033 }, { "epoch": 0.952863215803951, "grad_norm": 0.8982178568840027, "learning_rate": 2e-05, "loss": 0.6103, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15242, "tokens_per_second_per_gpu": 10776.54, "total_tokens": 1504862755 }, { "epoch": 0.9529257314328582, "grad_norm": 0.9190592169761658, "learning_rate": 2e-05, "loss": 0.5866, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15243, "tokens_per_second_per_gpu": 9847.33, "total_tokens": 1504959651 }, { "epoch": 0.9529882470617654, "grad_norm": 0.9737512469291687, "learning_rate": 2e-05, "loss": 0.6227, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15244, "tokens_per_second_per_gpu": 10039.28, "total_tokens": 1505058723 }, { "epoch": 0.9530507626906727, "grad_norm": 0.8804993033409119, "learning_rate": 2e-05, "loss": 0.5801, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15245, "tokens_per_second_per_gpu": 10084.51, "total_tokens": 1505157220 }, { "epoch": 0.9531132783195799, "grad_norm": 0.8434134721755981, "learning_rate": 2e-05, "loss": 0.5805, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15246, "tokens_per_second_per_gpu": 10851.34, "total_tokens": 1505254369 }, { "epoch": 0.9531757939484872, "grad_norm": 0.8912768959999084, "learning_rate": 2e-05, "loss": 0.6092, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15247, "tokens_per_second_per_gpu": 9649.71, "total_tokens": 1505349089 }, { "epoch": 0.9532383095773943, "grad_norm": 0.9096331596374512, "learning_rate": 2e-05, "loss": 0.6421, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15248, "tokens_per_second_per_gpu": 10798.86, "total_tokens": 1505447468 }, { "epoch": 0.9533008252063015, "grad_norm": 0.8792383074760437, "learning_rate": 2e-05, "loss": 0.6112, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15249, "tokens_per_second_per_gpu": 10891.9, "total_tokens": 1505548148 }, { "epoch": 0.9533633408352088, "grad_norm": 0.8640821576118469, "learning_rate": 2e-05, "loss": 0.6137, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15250, "tokens_per_second_per_gpu": 10795.58, "total_tokens": 1505648288 }, { "epoch": 0.953425856464116, "grad_norm": 0.9829953908920288, "learning_rate": 2e-05, "loss": 0.6071, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15251, "tokens_per_second_per_gpu": 11107.43, "total_tokens": 1505750854 }, { "epoch": 0.9534883720930233, "grad_norm": 0.9128183722496033, "learning_rate": 2e-05, "loss": 0.6257, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15252, "tokens_per_second_per_gpu": 10980.86, "total_tokens": 1505852945 }, { "epoch": 0.9535508877219305, "grad_norm": 0.895182728767395, "learning_rate": 2e-05, "loss": 0.6389, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15253, "tokens_per_second_per_gpu": 11104.64, "total_tokens": 1505955155 }, { "epoch": 0.9536134033508377, "grad_norm": 0.9537782073020935, "learning_rate": 2e-05, "loss": 0.634, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15254, "tokens_per_second_per_gpu": 10779.04, "total_tokens": 1506054456 }, { "epoch": 0.9536759189797449, "grad_norm": 0.9324855804443359, "learning_rate": 2e-05, "loss": 0.5918, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15255, "tokens_per_second_per_gpu": 9538.75, "total_tokens": 1506145845 }, { "epoch": 0.9537384346086522, "grad_norm": 0.8938472270965576, "learning_rate": 2e-05, "loss": 0.5839, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15256, "tokens_per_second_per_gpu": 10429.52, "total_tokens": 1506242995 }, { "epoch": 0.9538009502375594, "grad_norm": 0.9221652150154114, "learning_rate": 2e-05, "loss": 0.6009, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15257, "tokens_per_second_per_gpu": 10698.73, "total_tokens": 1506340658 }, { "epoch": 0.9538634658664666, "grad_norm": 0.9145812392234802, "learning_rate": 2e-05, "loss": 0.5954, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15258, "tokens_per_second_per_gpu": 11395.38, "total_tokens": 1506441086 }, { "epoch": 0.9539259814953739, "grad_norm": 0.9213876128196716, "learning_rate": 2e-05, "loss": 0.5824, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15259, "tokens_per_second_per_gpu": 9654.37, "total_tokens": 1506533332 }, { "epoch": 0.953988497124281, "grad_norm": 0.9426336288452148, "learning_rate": 2e-05, "loss": 0.6156, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15260, "tokens_per_second_per_gpu": 10053.61, "total_tokens": 1506626706 }, { "epoch": 0.9540510127531883, "grad_norm": 0.8719676733016968, "learning_rate": 2e-05, "loss": 0.6104, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15261, "tokens_per_second_per_gpu": 10726.84, "total_tokens": 1506725971 }, { "epoch": 0.9541135283820955, "grad_norm": 0.9098145961761475, "learning_rate": 2e-05, "loss": 0.5805, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15262, "tokens_per_second_per_gpu": 9961.09, "total_tokens": 1506820546 }, { "epoch": 0.9541760440110028, "grad_norm": 0.8952146768569946, "learning_rate": 2e-05, "loss": 0.5651, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15263, "tokens_per_second_per_gpu": 10614.38, "total_tokens": 1506919629 }, { "epoch": 0.95423855963991, "grad_norm": 0.9201188087463379, "learning_rate": 2e-05, "loss": 0.6347, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15264, "tokens_per_second_per_gpu": 10269.59, "total_tokens": 1507017660 }, { "epoch": 0.9543010752688172, "grad_norm": 0.8887618780136108, "learning_rate": 2e-05, "loss": 0.6101, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15265, "tokens_per_second_per_gpu": 10581.23, "total_tokens": 1507118434 }, { "epoch": 0.9543635908977244, "grad_norm": 0.8867547512054443, "learning_rate": 2e-05, "loss": 0.613, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15266, "tokens_per_second_per_gpu": 10361.49, "total_tokens": 1507217618 }, { "epoch": 0.9544261065266316, "grad_norm": 0.8681140542030334, "learning_rate": 2e-05, "loss": 0.6201, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15267, "tokens_per_second_per_gpu": 10743.82, "total_tokens": 1507316142 }, { "epoch": 0.9544886221555389, "grad_norm": 0.9016464948654175, "learning_rate": 2e-05, "loss": 0.6074, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15268, "tokens_per_second_per_gpu": 10668.88, "total_tokens": 1507414773 }, { "epoch": 0.9545511377844461, "grad_norm": 0.911989152431488, "learning_rate": 2e-05, "loss": 0.6259, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15269, "tokens_per_second_per_gpu": 10255.09, "total_tokens": 1507516492 }, { "epoch": 0.9546136534133534, "grad_norm": 0.8977295160293579, "learning_rate": 2e-05, "loss": 0.5818, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15270, "tokens_per_second_per_gpu": 11007.96, "total_tokens": 1507616095 }, { "epoch": 0.9546761690422606, "grad_norm": 0.909606397151947, "learning_rate": 2e-05, "loss": 0.6067, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15271, "tokens_per_second_per_gpu": 10556.44, "total_tokens": 1507717271 }, { "epoch": 0.9547386846711677, "grad_norm": 0.8580650687217712, "learning_rate": 2e-05, "loss": 0.6264, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15272, "tokens_per_second_per_gpu": 11260.19, "total_tokens": 1507820973 }, { "epoch": 0.954801200300075, "grad_norm": 0.9249377250671387, "learning_rate": 2e-05, "loss": 0.6314, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15273, "tokens_per_second_per_gpu": 10308.41, "total_tokens": 1507920445 }, { "epoch": 0.9548637159289822, "grad_norm": 0.8639211654663086, "learning_rate": 2e-05, "loss": 0.5881, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15274, "tokens_per_second_per_gpu": 10088.62, "total_tokens": 1508019802 }, { "epoch": 0.9549262315578895, "grad_norm": 0.8789044618606567, "learning_rate": 2e-05, "loss": 0.6281, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15275, "tokens_per_second_per_gpu": 11572.27, "total_tokens": 1508123400 }, { "epoch": 0.9549887471867967, "grad_norm": 0.9057657122612, "learning_rate": 2e-05, "loss": 0.5919, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15276, "tokens_per_second_per_gpu": 10802.25, "total_tokens": 1508220718 }, { "epoch": 0.955051262815704, "grad_norm": 0.8751125335693359, "learning_rate": 2e-05, "loss": 0.6109, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15277, "tokens_per_second_per_gpu": 10970.66, "total_tokens": 1508321053 }, { "epoch": 0.9551137784446112, "grad_norm": 0.8517187833786011, "learning_rate": 2e-05, "loss": 0.5805, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15278, "tokens_per_second_per_gpu": 10937.58, "total_tokens": 1508423151 }, { "epoch": 0.9551762940735183, "grad_norm": 0.9196893572807312, "learning_rate": 2e-05, "loss": 0.596, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15279, "tokens_per_second_per_gpu": 9667.18, "total_tokens": 1508517263 }, { "epoch": 0.9552388097024256, "grad_norm": 0.8861749768257141, "learning_rate": 2e-05, "loss": 0.6454, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15280, "tokens_per_second_per_gpu": 10514.45, "total_tokens": 1508618101 }, { "epoch": 0.9553013253313328, "grad_norm": 0.894863486289978, "learning_rate": 2e-05, "loss": 0.5974, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15281, "tokens_per_second_per_gpu": 10836.14, "total_tokens": 1508713989 }, { "epoch": 0.9553638409602401, "grad_norm": 0.8950381875038147, "learning_rate": 2e-05, "loss": 0.6136, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15282, "tokens_per_second_per_gpu": 10041.76, "total_tokens": 1508809305 }, { "epoch": 0.9554263565891473, "grad_norm": 0.897911787033081, "learning_rate": 2e-05, "loss": 0.621, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15283, "tokens_per_second_per_gpu": 10372.7, "total_tokens": 1508906559 }, { "epoch": 0.9554888722180546, "grad_norm": 0.901168942451477, "learning_rate": 2e-05, "loss": 0.6437, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15284, "tokens_per_second_per_gpu": 10656.2, "total_tokens": 1509010324 }, { "epoch": 0.9555513878469617, "grad_norm": 0.8853203058242798, "learning_rate": 2e-05, "loss": 0.6012, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15285, "tokens_per_second_per_gpu": 10275.74, "total_tokens": 1509106303 }, { "epoch": 0.9556139034758689, "grad_norm": 0.8617010116577148, "learning_rate": 2e-05, "loss": 0.612, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15286, "tokens_per_second_per_gpu": 10073.83, "total_tokens": 1509205638 }, { "epoch": 0.9556764191047762, "grad_norm": 0.8976972699165344, "learning_rate": 2e-05, "loss": 0.5823, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15287, "tokens_per_second_per_gpu": 10068.08, "total_tokens": 1509302583 }, { "epoch": 0.9557389347336834, "grad_norm": 0.8804864287376404, "learning_rate": 2e-05, "loss": 0.6133, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15288, "tokens_per_second_per_gpu": 11140.92, "total_tokens": 1509403003 }, { "epoch": 0.9558014503625907, "grad_norm": 0.9084903001785278, "learning_rate": 2e-05, "loss": 0.6383, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15289, "tokens_per_second_per_gpu": 10800.01, "total_tokens": 1509503494 }, { "epoch": 0.9558639659914979, "grad_norm": 0.8722193837165833, "learning_rate": 2e-05, "loss": 0.6137, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15290, "tokens_per_second_per_gpu": 10872.46, "total_tokens": 1509606095 }, { "epoch": 0.955926481620405, "grad_norm": 0.9075968861579895, "learning_rate": 2e-05, "loss": 0.627, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15291, "tokens_per_second_per_gpu": 10804.21, "total_tokens": 1509704693 }, { "epoch": 0.9559889972493123, "grad_norm": 0.9129265546798706, "learning_rate": 2e-05, "loss": 0.5889, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15292, "tokens_per_second_per_gpu": 10152.97, "total_tokens": 1509805334 }, { "epoch": 0.9560515128782195, "grad_norm": 0.8724355697631836, "learning_rate": 2e-05, "loss": 0.5636, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15293, "tokens_per_second_per_gpu": 10267.52, "total_tokens": 1509900705 }, { "epoch": 0.9561140285071268, "grad_norm": 0.8723388910293579, "learning_rate": 2e-05, "loss": 0.644, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15294, "tokens_per_second_per_gpu": 10090.73, "total_tokens": 1510000658 }, { "epoch": 0.956176544136034, "grad_norm": 0.8725665807723999, "learning_rate": 2e-05, "loss": 0.5789, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15295, "tokens_per_second_per_gpu": 10094.45, "total_tokens": 1510099415 }, { "epoch": 0.9562390597649413, "grad_norm": 0.8899799585342407, "learning_rate": 2e-05, "loss": 0.6262, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15296, "tokens_per_second_per_gpu": 10978.47, "total_tokens": 1510200622 }, { "epoch": 0.9563015753938484, "grad_norm": 0.9016556739807129, "learning_rate": 2e-05, "loss": 0.6042, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15297, "tokens_per_second_per_gpu": 10671.42, "total_tokens": 1510298619 }, { "epoch": 0.9563640910227557, "grad_norm": 0.9186118841171265, "learning_rate": 2e-05, "loss": 0.6118, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15298, "tokens_per_second_per_gpu": 10680.25, "total_tokens": 1510400075 }, { "epoch": 0.9564266066516629, "grad_norm": 0.8654956221580505, "learning_rate": 2e-05, "loss": 0.5845, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15299, "tokens_per_second_per_gpu": 10694.87, "total_tokens": 1510498236 }, { "epoch": 0.9564891222805701, "grad_norm": 0.9807938933372498, "learning_rate": 2e-05, "loss": 0.6027, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15300, "tokens_per_second_per_gpu": 10125.23, "total_tokens": 1510597250 }, { "epoch": 0.9565516379094774, "grad_norm": 0.8825690746307373, "learning_rate": 2e-05, "loss": 0.6024, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15301, "tokens_per_second_per_gpu": 10540.28, "total_tokens": 1510693960 }, { "epoch": 0.9566141535383846, "grad_norm": 0.9182253479957581, "learning_rate": 2e-05, "loss": 0.6727, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15302, "tokens_per_second_per_gpu": 10710.82, "total_tokens": 1510793479 }, { "epoch": 0.9566766691672918, "grad_norm": 0.9152700304985046, "learning_rate": 2e-05, "loss": 0.6332, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15303, "tokens_per_second_per_gpu": 11229.41, "total_tokens": 1510891605 }, { "epoch": 0.956739184796199, "grad_norm": 0.8986698985099792, "learning_rate": 2e-05, "loss": 0.5935, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15304, "tokens_per_second_per_gpu": 10283.37, "total_tokens": 1510990461 }, { "epoch": 0.9568017004251063, "grad_norm": 0.8773630857467651, "learning_rate": 2e-05, "loss": 0.612, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15305, "tokens_per_second_per_gpu": 10619.88, "total_tokens": 1511092670 }, { "epoch": 0.9568642160540135, "grad_norm": 0.8811764121055603, "learning_rate": 2e-05, "loss": 0.6204, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15306, "tokens_per_second_per_gpu": 10420.66, "total_tokens": 1511193165 }, { "epoch": 0.9569267316829208, "grad_norm": 0.9118132591247559, "learning_rate": 2e-05, "loss": 0.6135, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15307, "tokens_per_second_per_gpu": 9902.71, "total_tokens": 1511291937 }, { "epoch": 0.956989247311828, "grad_norm": 0.870646059513092, "learning_rate": 2e-05, "loss": 0.5798, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15308, "tokens_per_second_per_gpu": 10192.36, "total_tokens": 1511389792 }, { "epoch": 0.9570517629407351, "grad_norm": 0.9049181342124939, "learning_rate": 2e-05, "loss": 0.5894, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15309, "tokens_per_second_per_gpu": 10237.05, "total_tokens": 1511484342 }, { "epoch": 0.9571142785696424, "grad_norm": 0.8889354467391968, "learning_rate": 2e-05, "loss": 0.5906, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15310, "tokens_per_second_per_gpu": 10397.3, "total_tokens": 1511583447 }, { "epoch": 0.9571767941985496, "grad_norm": 0.8828863501548767, "learning_rate": 2e-05, "loss": 0.607, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15311, "tokens_per_second_per_gpu": 10490.17, "total_tokens": 1511682060 }, { "epoch": 0.9572393098274569, "grad_norm": 0.8992711305618286, "learning_rate": 2e-05, "loss": 0.6121, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15312, "tokens_per_second_per_gpu": 10907.88, "total_tokens": 1511782494 }, { "epoch": 0.9573018254563641, "grad_norm": 0.8643926978111267, "learning_rate": 2e-05, "loss": 0.5618, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15313, "tokens_per_second_per_gpu": 10819.65, "total_tokens": 1511878257 }, { "epoch": 0.9573643410852714, "grad_norm": 0.9354638457298279, "learning_rate": 2e-05, "loss": 0.5904, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15314, "tokens_per_second_per_gpu": 9602.75, "total_tokens": 1511973357 }, { "epoch": 0.9574268567141786, "grad_norm": 0.8812751770019531, "learning_rate": 2e-05, "loss": 0.5902, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15315, "tokens_per_second_per_gpu": 10655.51, "total_tokens": 1512075094 }, { "epoch": 0.9574893723430857, "grad_norm": 0.9266936182975769, "learning_rate": 2e-05, "loss": 0.6443, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15316, "tokens_per_second_per_gpu": 10299.55, "total_tokens": 1512173241 }, { "epoch": 0.957551887971993, "grad_norm": 0.849461555480957, "learning_rate": 2e-05, "loss": 0.6212, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15317, "tokens_per_second_per_gpu": 10820.99, "total_tokens": 1512274880 }, { "epoch": 0.9576144036009002, "grad_norm": 0.8919879794120789, "learning_rate": 2e-05, "loss": 0.6139, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15318, "tokens_per_second_per_gpu": 10319.73, "total_tokens": 1512373702 }, { "epoch": 0.9576769192298075, "grad_norm": 0.8864901065826416, "learning_rate": 2e-05, "loss": 0.6341, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15319, "tokens_per_second_per_gpu": 10681.38, "total_tokens": 1512471729 }, { "epoch": 0.9577394348587147, "grad_norm": 0.9418928027153015, "learning_rate": 2e-05, "loss": 0.685, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15320, "tokens_per_second_per_gpu": 9892.49, "total_tokens": 1512568248 }, { "epoch": 0.957801950487622, "grad_norm": 0.8704786896705627, "learning_rate": 2e-05, "loss": 0.552, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15321, "tokens_per_second_per_gpu": 10082.93, "total_tokens": 1512662709 }, { "epoch": 0.9578644661165291, "grad_norm": 0.8783136606216431, "learning_rate": 2e-05, "loss": 0.6344, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15322, "tokens_per_second_per_gpu": 10669.47, "total_tokens": 1512764123 }, { "epoch": 0.9579269817454363, "grad_norm": 0.8776615262031555, "learning_rate": 2e-05, "loss": 0.6258, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15323, "tokens_per_second_per_gpu": 10781.25, "total_tokens": 1512865741 }, { "epoch": 0.9579894973743436, "grad_norm": 0.8353767991065979, "learning_rate": 2e-05, "loss": 0.5816, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15324, "tokens_per_second_per_gpu": 10956.52, "total_tokens": 1512965768 }, { "epoch": 0.9580520130032508, "grad_norm": 0.8596255779266357, "learning_rate": 2e-05, "loss": 0.5985, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15325, "tokens_per_second_per_gpu": 10264.8, "total_tokens": 1513067716 }, { "epoch": 0.9581145286321581, "grad_norm": 0.9481082558631897, "learning_rate": 2e-05, "loss": 0.6442, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15326, "tokens_per_second_per_gpu": 10692.26, "total_tokens": 1513167112 }, { "epoch": 0.9581770442610653, "grad_norm": 0.9075397849082947, "learning_rate": 2e-05, "loss": 0.6315, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15327, "tokens_per_second_per_gpu": 11662.28, "total_tokens": 1513270463 }, { "epoch": 0.9582395598899724, "grad_norm": 0.9010629057884216, "learning_rate": 2e-05, "loss": 0.6006, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15328, "tokens_per_second_per_gpu": 10663.16, "total_tokens": 1513371554 }, { "epoch": 0.9583020755188797, "grad_norm": 0.8725638389587402, "learning_rate": 2e-05, "loss": 0.5916, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15329, "tokens_per_second_per_gpu": 10771.38, "total_tokens": 1513468961 }, { "epoch": 0.9583645911477869, "grad_norm": 0.8468940854072571, "learning_rate": 2e-05, "loss": 0.6177, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15330, "tokens_per_second_per_gpu": 11017.91, "total_tokens": 1513572830 }, { "epoch": 0.9584271067766942, "grad_norm": 0.9242638945579529, "learning_rate": 2e-05, "loss": 0.5782, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15331, "tokens_per_second_per_gpu": 9192.62, "total_tokens": 1513661102 }, { "epoch": 0.9584896224056014, "grad_norm": 0.8729480504989624, "learning_rate": 2e-05, "loss": 0.5918, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15332, "tokens_per_second_per_gpu": 10316.57, "total_tokens": 1513760720 }, { "epoch": 0.9585521380345087, "grad_norm": 0.8821650743484497, "learning_rate": 2e-05, "loss": 0.6062, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15333, "tokens_per_second_per_gpu": 10698.04, "total_tokens": 1513858659 }, { "epoch": 0.9586146536634158, "grad_norm": 0.8895883560180664, "learning_rate": 2e-05, "loss": 0.5819, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15334, "tokens_per_second_per_gpu": 10912.15, "total_tokens": 1513954394 }, { "epoch": 0.958677169292323, "grad_norm": 0.8726145625114441, "learning_rate": 2e-05, "loss": 0.6087, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15335, "tokens_per_second_per_gpu": 10172.54, "total_tokens": 1514051469 }, { "epoch": 0.9587396849212303, "grad_norm": 0.9007419943809509, "learning_rate": 2e-05, "loss": 0.6222, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15336, "tokens_per_second_per_gpu": 9762.5, "total_tokens": 1514149058 }, { "epoch": 0.9588022005501375, "grad_norm": 0.893876850605011, "learning_rate": 2e-05, "loss": 0.6193, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15337, "tokens_per_second_per_gpu": 10187.96, "total_tokens": 1514245426 }, { "epoch": 0.9588647161790448, "grad_norm": 0.8822686076164246, "learning_rate": 2e-05, "loss": 0.5905, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15338, "tokens_per_second_per_gpu": 10702.4, "total_tokens": 1514342691 }, { "epoch": 0.958927231807952, "grad_norm": 0.8555301427841187, "learning_rate": 2e-05, "loss": 0.5977, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15339, "tokens_per_second_per_gpu": 10833.02, "total_tokens": 1514440134 }, { "epoch": 0.9589897474368592, "grad_norm": 0.8882433772087097, "learning_rate": 2e-05, "loss": 0.5755, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15340, "tokens_per_second_per_gpu": 9527.41, "total_tokens": 1514532261 }, { "epoch": 0.9590522630657664, "grad_norm": 0.9096015095710754, "learning_rate": 2e-05, "loss": 0.6023, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15341, "tokens_per_second_per_gpu": 9959.19, "total_tokens": 1514628235 }, { "epoch": 0.9591147786946737, "grad_norm": 0.8944178819656372, "learning_rate": 2e-05, "loss": 0.6144, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15342, "tokens_per_second_per_gpu": 10169.29, "total_tokens": 1514726605 }, { "epoch": 0.9591772943235809, "grad_norm": 0.9316334128379822, "learning_rate": 2e-05, "loss": 0.6093, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15343, "tokens_per_second_per_gpu": 9547.02, "total_tokens": 1514821065 }, { "epoch": 0.9592398099524881, "grad_norm": 0.9008394479751587, "learning_rate": 2e-05, "loss": 0.5768, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15344, "tokens_per_second_per_gpu": 10030.04, "total_tokens": 1514914674 }, { "epoch": 0.9593023255813954, "grad_norm": 0.8968571424484253, "learning_rate": 2e-05, "loss": 0.6169, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15345, "tokens_per_second_per_gpu": 11430.39, "total_tokens": 1515015658 }, { "epoch": 0.9593648412103025, "grad_norm": 0.9031557440757751, "learning_rate": 2e-05, "loss": 0.6148, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15346, "tokens_per_second_per_gpu": 11163.93, "total_tokens": 1515112688 }, { "epoch": 0.9594273568392098, "grad_norm": 0.8725360631942749, "learning_rate": 2e-05, "loss": 0.6254, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15347, "tokens_per_second_per_gpu": 10907.82, "total_tokens": 1515213694 }, { "epoch": 0.959489872468117, "grad_norm": 0.8867828845977783, "learning_rate": 2e-05, "loss": 0.6271, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15348, "tokens_per_second_per_gpu": 11111.86, "total_tokens": 1515314128 }, { "epoch": 0.9595523880970243, "grad_norm": 0.8599992990493774, "learning_rate": 2e-05, "loss": 0.6043, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15349, "tokens_per_second_per_gpu": 10494.75, "total_tokens": 1515414295 }, { "epoch": 0.9596149037259315, "grad_norm": 0.8600468635559082, "learning_rate": 2e-05, "loss": 0.5825, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15350, "tokens_per_second_per_gpu": 10120.95, "total_tokens": 1515510791 }, { "epoch": 0.9596774193548387, "grad_norm": 0.8802189230918884, "learning_rate": 2e-05, "loss": 0.6046, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15351, "tokens_per_second_per_gpu": 11366.46, "total_tokens": 1515611362 }, { "epoch": 0.959739934983746, "grad_norm": 0.9602874517440796, "learning_rate": 2e-05, "loss": 0.6184, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15352, "tokens_per_second_per_gpu": 10071.34, "total_tokens": 1515709151 }, { "epoch": 0.9598024506126531, "grad_norm": 0.9300311207771301, "learning_rate": 2e-05, "loss": 0.6053, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15353, "tokens_per_second_per_gpu": 10077.61, "total_tokens": 1515807341 }, { "epoch": 0.9598649662415604, "grad_norm": 0.942427396774292, "learning_rate": 2e-05, "loss": 0.6645, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15354, "tokens_per_second_per_gpu": 10483.14, "total_tokens": 1515903900 }, { "epoch": 0.9599274818704676, "grad_norm": 0.894413411617279, "learning_rate": 2e-05, "loss": 0.6003, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15355, "tokens_per_second_per_gpu": 10165.45, "total_tokens": 1515999044 }, { "epoch": 0.9599899974993749, "grad_norm": 0.9190333485603333, "learning_rate": 2e-05, "loss": 0.6593, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15356, "tokens_per_second_per_gpu": 10577.66, "total_tokens": 1516100047 }, { "epoch": 0.9600525131282821, "grad_norm": 0.9230753183364868, "learning_rate": 2e-05, "loss": 0.6326, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15357, "tokens_per_second_per_gpu": 10846.39, "total_tokens": 1516196991 }, { "epoch": 0.9601150287571893, "grad_norm": 0.9127322435379028, "learning_rate": 2e-05, "loss": 0.6405, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15358, "tokens_per_second_per_gpu": 10026.06, "total_tokens": 1516294959 }, { "epoch": 0.9601775443860965, "grad_norm": 0.8889802694320679, "learning_rate": 2e-05, "loss": 0.6315, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15359, "tokens_per_second_per_gpu": 10694.7, "total_tokens": 1516394555 }, { "epoch": 0.9602400600150037, "grad_norm": 0.886493444442749, "learning_rate": 2e-05, "loss": 0.6388, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15360, "tokens_per_second_per_gpu": 10775.08, "total_tokens": 1516492904 }, { "epoch": 0.960302575643911, "grad_norm": 0.8594564199447632, "learning_rate": 2e-05, "loss": 0.623, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15361, "tokens_per_second_per_gpu": 11028.94, "total_tokens": 1516595606 }, { "epoch": 0.9603650912728182, "grad_norm": 0.876211941242218, "learning_rate": 2e-05, "loss": 0.6088, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15362, "tokens_per_second_per_gpu": 10580.97, "total_tokens": 1516698543 }, { "epoch": 0.9604276069017255, "grad_norm": 0.9059147834777832, "learning_rate": 2e-05, "loss": 0.5977, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15363, "tokens_per_second_per_gpu": 10362.98, "total_tokens": 1516794445 }, { "epoch": 0.9604901225306327, "grad_norm": 0.9503108263015747, "learning_rate": 2e-05, "loss": 0.63, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15364, "tokens_per_second_per_gpu": 10653.44, "total_tokens": 1516890372 }, { "epoch": 0.9605526381595398, "grad_norm": 0.8840279579162598, "learning_rate": 2e-05, "loss": 0.5731, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15365, "tokens_per_second_per_gpu": 10976.6, "total_tokens": 1516990519 }, { "epoch": 0.9606151537884471, "grad_norm": 0.8879490494728088, "learning_rate": 2e-05, "loss": 0.6456, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15366, "tokens_per_second_per_gpu": 10768.18, "total_tokens": 1517091836 }, { "epoch": 0.9606776694173543, "grad_norm": 0.9291477203369141, "learning_rate": 2e-05, "loss": 0.6307, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15367, "tokens_per_second_per_gpu": 9561.46, "total_tokens": 1517189343 }, { "epoch": 0.9607401850462616, "grad_norm": 0.8844820261001587, "learning_rate": 2e-05, "loss": 0.5774, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15368, "tokens_per_second_per_gpu": 10604.54, "total_tokens": 1517285051 }, { "epoch": 0.9608027006751688, "grad_norm": 0.9002164602279663, "learning_rate": 2e-05, "loss": 0.6023, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15369, "tokens_per_second_per_gpu": 10692.82, "total_tokens": 1517380917 }, { "epoch": 0.9608652163040761, "grad_norm": 0.8906503319740295, "learning_rate": 2e-05, "loss": 0.5964, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15370, "tokens_per_second_per_gpu": 10748.29, "total_tokens": 1517481993 }, { "epoch": 0.9609277319329832, "grad_norm": 0.8751025795936584, "learning_rate": 2e-05, "loss": 0.5823, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15371, "tokens_per_second_per_gpu": 9746.16, "total_tokens": 1517573354 }, { "epoch": 0.9609902475618904, "grad_norm": 0.8676263093948364, "learning_rate": 2e-05, "loss": 0.5999, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15372, "tokens_per_second_per_gpu": 11023.7, "total_tokens": 1517668008 }, { "epoch": 0.9610527631907977, "grad_norm": 0.8908246755599976, "learning_rate": 2e-05, "loss": 0.5897, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15373, "tokens_per_second_per_gpu": 10522.01, "total_tokens": 1517768100 }, { "epoch": 0.9611152788197049, "grad_norm": 0.9037086963653564, "learning_rate": 2e-05, "loss": 0.6017, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15374, "tokens_per_second_per_gpu": 10470.87, "total_tokens": 1517867393 }, { "epoch": 0.9611777944486122, "grad_norm": 1.1757841110229492, "learning_rate": 2e-05, "loss": 0.6101, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15375, "tokens_per_second_per_gpu": 10511.67, "total_tokens": 1517963896 }, { "epoch": 0.9612403100775194, "grad_norm": 0.8862216472625732, "learning_rate": 2e-05, "loss": 0.5912, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15376, "tokens_per_second_per_gpu": 10065.04, "total_tokens": 1518061468 }, { "epoch": 0.9613028257064266, "grad_norm": 0.8253942131996155, "learning_rate": 2e-05, "loss": 0.5427, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15377, "tokens_per_second_per_gpu": 10898.61, "total_tokens": 1518159240 }, { "epoch": 0.9613653413353338, "grad_norm": 0.908174991607666, "learning_rate": 2e-05, "loss": 0.6291, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15378, "tokens_per_second_per_gpu": 9859.79, "total_tokens": 1518255164 }, { "epoch": 0.961427856964241, "grad_norm": 0.8818801045417786, "learning_rate": 2e-05, "loss": 0.6188, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15379, "tokens_per_second_per_gpu": 10051.34, "total_tokens": 1518350833 }, { "epoch": 0.9614903725931483, "grad_norm": 0.9133412837982178, "learning_rate": 2e-05, "loss": 0.5968, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15380, "tokens_per_second_per_gpu": 8849.88, "total_tokens": 1518442411 }, { "epoch": 0.9615528882220555, "grad_norm": 0.9438663721084595, "learning_rate": 2e-05, "loss": 0.6386, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15381, "tokens_per_second_per_gpu": 10228.89, "total_tokens": 1518540993 }, { "epoch": 0.9616154038509628, "grad_norm": 0.8744282722473145, "learning_rate": 2e-05, "loss": 0.5675, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15382, "tokens_per_second_per_gpu": 10282.85, "total_tokens": 1518636977 }, { "epoch": 0.9616779194798699, "grad_norm": 0.8901247382164001, "learning_rate": 2e-05, "loss": 0.6621, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15383, "tokens_per_second_per_gpu": 11280.13, "total_tokens": 1518740722 }, { "epoch": 0.9617404351087772, "grad_norm": 0.8338512778282166, "learning_rate": 2e-05, "loss": 0.6137, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15384, "tokens_per_second_per_gpu": 10594.16, "total_tokens": 1518841967 }, { "epoch": 0.9618029507376844, "grad_norm": 0.8500287532806396, "learning_rate": 2e-05, "loss": 0.578, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15385, "tokens_per_second_per_gpu": 10575.25, "total_tokens": 1518942670 }, { "epoch": 0.9618654663665916, "grad_norm": 0.9091889262199402, "learning_rate": 2e-05, "loss": 0.5663, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15386, "tokens_per_second_per_gpu": 9737.3, "total_tokens": 1519036741 }, { "epoch": 0.9619279819954989, "grad_norm": 0.8757301568984985, "learning_rate": 2e-05, "loss": 0.5786, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15387, "tokens_per_second_per_gpu": 10877.63, "total_tokens": 1519135697 }, { "epoch": 0.9619904976244061, "grad_norm": 0.872256338596344, "learning_rate": 2e-05, "loss": 0.5985, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15388, "tokens_per_second_per_gpu": 11197.53, "total_tokens": 1519237856 }, { "epoch": 0.9620530132533134, "grad_norm": 0.8789654970169067, "learning_rate": 2e-05, "loss": 0.6005, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15389, "tokens_per_second_per_gpu": 9990.91, "total_tokens": 1519334659 }, { "epoch": 0.9621155288822205, "grad_norm": 0.8796861171722412, "learning_rate": 2e-05, "loss": 0.5954, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15390, "tokens_per_second_per_gpu": 10184.62, "total_tokens": 1519431214 }, { "epoch": 0.9621780445111278, "grad_norm": 0.9338942766189575, "learning_rate": 2e-05, "loss": 0.6196, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15391, "tokens_per_second_per_gpu": 10321.63, "total_tokens": 1519527240 }, { "epoch": 0.962240560140035, "grad_norm": 0.867901086807251, "learning_rate": 2e-05, "loss": 0.6158, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15392, "tokens_per_second_per_gpu": 10131.93, "total_tokens": 1519628039 }, { "epoch": 0.9623030757689423, "grad_norm": 0.884486734867096, "learning_rate": 2e-05, "loss": 0.6273, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15393, "tokens_per_second_per_gpu": 10841.66, "total_tokens": 1519728909 }, { "epoch": 0.9623655913978495, "grad_norm": 0.96242755651474, "learning_rate": 2e-05, "loss": 0.6279, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15394, "tokens_per_second_per_gpu": 10313.91, "total_tokens": 1519823740 }, { "epoch": 0.9624281070267567, "grad_norm": 0.8637381792068481, "learning_rate": 2e-05, "loss": 0.5978, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15395, "tokens_per_second_per_gpu": 11073.53, "total_tokens": 1519923443 }, { "epoch": 0.9624906226556639, "grad_norm": 0.8790519833564758, "learning_rate": 2e-05, "loss": 0.6239, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15396, "tokens_per_second_per_gpu": 10325.64, "total_tokens": 1520021965 }, { "epoch": 0.9625531382845711, "grad_norm": 0.8671929836273193, "learning_rate": 2e-05, "loss": 0.5853, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15397, "tokens_per_second_per_gpu": 9718.5, "total_tokens": 1520118497 }, { "epoch": 0.9626156539134784, "grad_norm": 0.8597872853279114, "learning_rate": 2e-05, "loss": 0.562, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15398, "tokens_per_second_per_gpu": 10143.58, "total_tokens": 1520213577 }, { "epoch": 0.9626781695423856, "grad_norm": 0.8806391358375549, "learning_rate": 2e-05, "loss": 0.679, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15399, "tokens_per_second_per_gpu": 10539.74, "total_tokens": 1520314046 }, { "epoch": 0.9627406851712929, "grad_norm": 0.9012643098831177, "learning_rate": 2e-05, "loss": 0.6495, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15400, "tokens_per_second_per_gpu": 10582.44, "total_tokens": 1520414372 }, { "epoch": 0.9628032008002001, "grad_norm": 0.9048773050308228, "learning_rate": 2e-05, "loss": 0.6448, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15401, "tokens_per_second_per_gpu": 11483.62, "total_tokens": 1520518513 }, { "epoch": 0.9628657164291072, "grad_norm": 0.9042850732803345, "learning_rate": 2e-05, "loss": 0.6084, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15402, "tokens_per_second_per_gpu": 10651.48, "total_tokens": 1520617145 }, { "epoch": 0.9629282320580145, "grad_norm": 0.9235382676124573, "learning_rate": 2e-05, "loss": 0.6413, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15403, "tokens_per_second_per_gpu": 10217.47, "total_tokens": 1520714989 }, { "epoch": 0.9629907476869217, "grad_norm": 0.8759711980819702, "learning_rate": 2e-05, "loss": 0.6098, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15404, "tokens_per_second_per_gpu": 10119.37, "total_tokens": 1520814506 }, { "epoch": 0.963053263315829, "grad_norm": 0.9097321033477783, "learning_rate": 2e-05, "loss": 0.6088, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15405, "tokens_per_second_per_gpu": 10541.1, "total_tokens": 1520911732 }, { "epoch": 0.9631157789447362, "grad_norm": 0.9083371758460999, "learning_rate": 2e-05, "loss": 0.5858, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15406, "tokens_per_second_per_gpu": 9994.97, "total_tokens": 1521008692 }, { "epoch": 0.9631782945736435, "grad_norm": 0.8762295246124268, "learning_rate": 2e-05, "loss": 0.596, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15407, "tokens_per_second_per_gpu": 10991.48, "total_tokens": 1521104866 }, { "epoch": 0.9632408102025506, "grad_norm": 0.9076113700866699, "learning_rate": 2e-05, "loss": 0.6542, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15408, "tokens_per_second_per_gpu": 10295.2, "total_tokens": 1521200967 }, { "epoch": 0.9633033258314578, "grad_norm": 0.875411868095398, "learning_rate": 2e-05, "loss": 0.6182, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15409, "tokens_per_second_per_gpu": 11559.35, "total_tokens": 1521304823 }, { "epoch": 0.9633658414603651, "grad_norm": 0.8584581017494202, "learning_rate": 2e-05, "loss": 0.5788, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15410, "tokens_per_second_per_gpu": 10203.83, "total_tokens": 1521405065 }, { "epoch": 0.9634283570892723, "grad_norm": 0.9162997603416443, "learning_rate": 2e-05, "loss": 0.6021, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15411, "tokens_per_second_per_gpu": 11392.92, "total_tokens": 1521504795 }, { "epoch": 0.9634908727181796, "grad_norm": 0.8679909110069275, "learning_rate": 2e-05, "loss": 0.5901, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15412, "tokens_per_second_per_gpu": 10599.15, "total_tokens": 1521605748 }, { "epoch": 0.9635533883470868, "grad_norm": 0.9273282885551453, "learning_rate": 2e-05, "loss": 0.5915, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15413, "tokens_per_second_per_gpu": 9828.92, "total_tokens": 1521696553 }, { "epoch": 0.963615903975994, "grad_norm": 0.9014376997947693, "learning_rate": 2e-05, "loss": 0.5845, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15414, "tokens_per_second_per_gpu": 10947.38, "total_tokens": 1521788713 }, { "epoch": 0.9636784196049012, "grad_norm": 0.8941258192062378, "learning_rate": 2e-05, "loss": 0.6053, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15415, "tokens_per_second_per_gpu": 9649.8, "total_tokens": 1521884510 }, { "epoch": 0.9637409352338084, "grad_norm": 0.8954280614852905, "learning_rate": 2e-05, "loss": 0.6121, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15416, "tokens_per_second_per_gpu": 10172.66, "total_tokens": 1521981806 }, { "epoch": 0.9638034508627157, "grad_norm": 0.8976242542266846, "learning_rate": 2e-05, "loss": 0.6137, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15417, "tokens_per_second_per_gpu": 10094.74, "total_tokens": 1522081061 }, { "epoch": 0.9638659664916229, "grad_norm": 0.8875283598899841, "learning_rate": 2e-05, "loss": 0.5679, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15418, "tokens_per_second_per_gpu": 10422.57, "total_tokens": 1522178070 }, { "epoch": 0.9639284821205302, "grad_norm": 0.8953995108604431, "learning_rate": 2e-05, "loss": 0.5531, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15419, "tokens_per_second_per_gpu": 9124.33, "total_tokens": 1522267924 }, { "epoch": 0.9639909977494373, "grad_norm": 0.9529130458831787, "learning_rate": 2e-05, "loss": 0.5987, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15420, "tokens_per_second_per_gpu": 10247.56, "total_tokens": 1522363180 }, { "epoch": 0.9640535133783446, "grad_norm": 0.9215424656867981, "learning_rate": 2e-05, "loss": 0.6043, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15421, "tokens_per_second_per_gpu": 10382.07, "total_tokens": 1522461169 }, { "epoch": 0.9641160290072518, "grad_norm": 0.8879169821739197, "learning_rate": 2e-05, "loss": 0.5943, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15422, "tokens_per_second_per_gpu": 10759.93, "total_tokens": 1522561069 }, { "epoch": 0.964178544636159, "grad_norm": 0.9028314352035522, "learning_rate": 2e-05, "loss": 0.6021, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15423, "tokens_per_second_per_gpu": 10179.57, "total_tokens": 1522657778 }, { "epoch": 0.9642410602650663, "grad_norm": 0.873145580291748, "learning_rate": 2e-05, "loss": 0.6369, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15424, "tokens_per_second_per_gpu": 10649.98, "total_tokens": 1522760832 }, { "epoch": 0.9643035758939735, "grad_norm": 0.8721334338188171, "learning_rate": 2e-05, "loss": 0.5665, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15425, "tokens_per_second_per_gpu": 11053.24, "total_tokens": 1522858282 }, { "epoch": 0.9643660915228808, "grad_norm": 0.8753096461296082, "learning_rate": 2e-05, "loss": 0.5679, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15426, "tokens_per_second_per_gpu": 9817.42, "total_tokens": 1522953409 }, { "epoch": 0.9644286071517879, "grad_norm": 0.9370524287223816, "learning_rate": 2e-05, "loss": 0.6342, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15427, "tokens_per_second_per_gpu": 10945.41, "total_tokens": 1523053851 }, { "epoch": 0.9644911227806952, "grad_norm": 0.8954775929450989, "learning_rate": 2e-05, "loss": 0.6093, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15428, "tokens_per_second_per_gpu": 11001.31, "total_tokens": 1523151521 }, { "epoch": 0.9645536384096024, "grad_norm": 0.8937773704528809, "learning_rate": 2e-05, "loss": 0.6117, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15429, "tokens_per_second_per_gpu": 10695.82, "total_tokens": 1523250490 }, { "epoch": 0.9646161540385096, "grad_norm": 0.8701948523521423, "learning_rate": 2e-05, "loss": 0.697, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15430, "tokens_per_second_per_gpu": 10846.66, "total_tokens": 1523353471 }, { "epoch": 0.9646786696674169, "grad_norm": 0.8511091470718384, "learning_rate": 2e-05, "loss": 0.6274, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15431, "tokens_per_second_per_gpu": 11022.82, "total_tokens": 1523456271 }, { "epoch": 0.9647411852963241, "grad_norm": 0.8635867238044739, "learning_rate": 2e-05, "loss": 0.5688, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15432, "tokens_per_second_per_gpu": 9961.41, "total_tokens": 1523552580 }, { "epoch": 0.9648037009252313, "grad_norm": 0.9013525247573853, "learning_rate": 2e-05, "loss": 0.5906, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15433, "tokens_per_second_per_gpu": 10185.52, "total_tokens": 1523647108 }, { "epoch": 0.9648662165541385, "grad_norm": 0.9283854365348816, "learning_rate": 2e-05, "loss": 0.5546, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15434, "tokens_per_second_per_gpu": 8696.36, "total_tokens": 1523739033 }, { "epoch": 0.9649287321830458, "grad_norm": 0.9272287487983704, "learning_rate": 2e-05, "loss": 0.6151, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15435, "tokens_per_second_per_gpu": 10390.1, "total_tokens": 1523839447 }, { "epoch": 0.964991247811953, "grad_norm": 0.9076550602912903, "learning_rate": 2e-05, "loss": 0.6191, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15436, "tokens_per_second_per_gpu": 10380.67, "total_tokens": 1523935870 }, { "epoch": 0.9650537634408602, "grad_norm": 0.8849819898605347, "learning_rate": 2e-05, "loss": 0.6184, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15437, "tokens_per_second_per_gpu": 11128.49, "total_tokens": 1524039025 }, { "epoch": 0.9651162790697675, "grad_norm": 0.8900612592697144, "learning_rate": 2e-05, "loss": 0.5923, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15438, "tokens_per_second_per_gpu": 10439.01, "total_tokens": 1524134768 }, { "epoch": 0.9651787946986746, "grad_norm": 0.9040641188621521, "learning_rate": 2e-05, "loss": 0.6066, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15439, "tokens_per_second_per_gpu": 10201.79, "total_tokens": 1524230154 }, { "epoch": 0.9652413103275819, "grad_norm": 0.891392707824707, "learning_rate": 2e-05, "loss": 0.6049, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15440, "tokens_per_second_per_gpu": 10508.73, "total_tokens": 1524328697 }, { "epoch": 0.9653038259564891, "grad_norm": 0.8680939078330994, "learning_rate": 2e-05, "loss": 0.611, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15441, "tokens_per_second_per_gpu": 10451.5, "total_tokens": 1524431409 }, { "epoch": 0.9653663415853964, "grad_norm": 0.922971785068512, "learning_rate": 2e-05, "loss": 0.5962, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15442, "tokens_per_second_per_gpu": 10527.37, "total_tokens": 1524522498 }, { "epoch": 0.9654288572143036, "grad_norm": 0.9237827062606812, "learning_rate": 2e-05, "loss": 0.655, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15443, "tokens_per_second_per_gpu": 11074.85, "total_tokens": 1524619941 }, { "epoch": 0.9654913728432108, "grad_norm": 0.8738309741020203, "learning_rate": 2e-05, "loss": 0.5976, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15444, "tokens_per_second_per_gpu": 10215.97, "total_tokens": 1524713390 }, { "epoch": 0.965553888472118, "grad_norm": 0.8791325688362122, "learning_rate": 2e-05, "loss": 0.5927, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15445, "tokens_per_second_per_gpu": 10519.8, "total_tokens": 1524808498 }, { "epoch": 0.9656164041010252, "grad_norm": 0.8563464879989624, "learning_rate": 2e-05, "loss": 0.6038, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15446, "tokens_per_second_per_gpu": 10920.12, "total_tokens": 1524908536 }, { "epoch": 0.9656789197299325, "grad_norm": 0.8428649306297302, "learning_rate": 2e-05, "loss": 0.6077, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15447, "tokens_per_second_per_gpu": 10995.28, "total_tokens": 1525009164 }, { "epoch": 0.9657414353588397, "grad_norm": 0.8522247672080994, "learning_rate": 2e-05, "loss": 0.5794, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15448, "tokens_per_second_per_gpu": 10352.76, "total_tokens": 1525106376 }, { "epoch": 0.965803950987747, "grad_norm": 0.8783209919929504, "learning_rate": 2e-05, "loss": 0.6099, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15449, "tokens_per_second_per_gpu": 10471.55, "total_tokens": 1525202381 }, { "epoch": 0.9658664666166542, "grad_norm": 0.902961015701294, "learning_rate": 2e-05, "loss": 0.6051, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15450, "tokens_per_second_per_gpu": 10110.6, "total_tokens": 1525299222 }, { "epoch": 0.9659289822455613, "grad_norm": 0.903778612613678, "learning_rate": 2e-05, "loss": 0.5857, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15451, "tokens_per_second_per_gpu": 10339.14, "total_tokens": 1525395588 }, { "epoch": 0.9659914978744686, "grad_norm": 0.8655142784118652, "learning_rate": 2e-05, "loss": 0.6073, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15452, "tokens_per_second_per_gpu": 10733.47, "total_tokens": 1525496568 }, { "epoch": 0.9660540135033758, "grad_norm": 0.9224599599838257, "learning_rate": 2e-05, "loss": 0.6497, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15453, "tokens_per_second_per_gpu": 11172.45, "total_tokens": 1525597296 }, { "epoch": 0.9661165291322831, "grad_norm": 0.892503023147583, "learning_rate": 2e-05, "loss": 0.6058, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15454, "tokens_per_second_per_gpu": 10158.25, "total_tokens": 1525692721 }, { "epoch": 0.9661790447611903, "grad_norm": 0.8980035185813904, "learning_rate": 2e-05, "loss": 0.6176, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15455, "tokens_per_second_per_gpu": 10623.4, "total_tokens": 1525792317 }, { "epoch": 0.9662415603900976, "grad_norm": 0.9507616758346558, "learning_rate": 2e-05, "loss": 0.6193, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15456, "tokens_per_second_per_gpu": 10483.8, "total_tokens": 1525891411 }, { "epoch": 0.9663040760190047, "grad_norm": 0.9260314702987671, "learning_rate": 2e-05, "loss": 0.6151, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15457, "tokens_per_second_per_gpu": 10689.11, "total_tokens": 1525990624 }, { "epoch": 0.9663665916479119, "grad_norm": 0.8862563371658325, "learning_rate": 2e-05, "loss": 0.6609, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15458, "tokens_per_second_per_gpu": 10456.48, "total_tokens": 1526087651 }, { "epoch": 0.9664291072768192, "grad_norm": 0.8951948881149292, "learning_rate": 2e-05, "loss": 0.6253, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15459, "tokens_per_second_per_gpu": 11089.0, "total_tokens": 1526186423 }, { "epoch": 0.9664916229057264, "grad_norm": 0.930508017539978, "learning_rate": 2e-05, "loss": 0.6062, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15460, "tokens_per_second_per_gpu": 10134.07, "total_tokens": 1526281877 }, { "epoch": 0.9665541385346337, "grad_norm": 0.9042463302612305, "learning_rate": 2e-05, "loss": 0.6158, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15461, "tokens_per_second_per_gpu": 10286.96, "total_tokens": 1526379822 }, { "epoch": 0.9666166541635409, "grad_norm": 0.9037677049636841, "learning_rate": 2e-05, "loss": 0.6304, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15462, "tokens_per_second_per_gpu": 10147.04, "total_tokens": 1526473934 }, { "epoch": 0.9666791697924482, "grad_norm": 0.8967102766036987, "learning_rate": 2e-05, "loss": 0.5983, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15463, "tokens_per_second_per_gpu": 10364.48, "total_tokens": 1526569795 }, { "epoch": 0.9667416854213553, "grad_norm": 0.9166309237480164, "learning_rate": 2e-05, "loss": 0.6095, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15464, "tokens_per_second_per_gpu": 10861.26, "total_tokens": 1526665782 }, { "epoch": 0.9668042010502625, "grad_norm": 0.8824495077133179, "learning_rate": 2e-05, "loss": 0.6393, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15465, "tokens_per_second_per_gpu": 9921.15, "total_tokens": 1526764904 }, { "epoch": 0.9668667166791698, "grad_norm": 0.9059057831764221, "learning_rate": 2e-05, "loss": 0.6264, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15466, "tokens_per_second_per_gpu": 10671.57, "total_tokens": 1526862874 }, { "epoch": 0.966929232308077, "grad_norm": 0.8927135467529297, "learning_rate": 2e-05, "loss": 0.6095, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15467, "tokens_per_second_per_gpu": 16730.46, "total_tokens": 1526960251 }, { "epoch": 0.9669917479369843, "grad_norm": 0.8996100425720215, "learning_rate": 2e-05, "loss": 0.6375, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15468, "tokens_per_second_per_gpu": 17815.32, "total_tokens": 1527060358 }, { "epoch": 0.9670542635658915, "grad_norm": 0.890582263469696, "learning_rate": 2e-05, "loss": 0.5862, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15469, "tokens_per_second_per_gpu": 17606.76, "total_tokens": 1527158061 }, { "epoch": 0.9671167791947987, "grad_norm": 0.8683497905731201, "learning_rate": 2e-05, "loss": 0.5912, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15470, "tokens_per_second_per_gpu": 18281.93, "total_tokens": 1527257794 }, { "epoch": 0.9671792948237059, "grad_norm": 0.9174851179122925, "learning_rate": 2e-05, "loss": 0.6185, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15471, "tokens_per_second_per_gpu": 16950.63, "total_tokens": 1527356342 }, { "epoch": 0.9672418104526131, "grad_norm": 0.8711020946502686, "learning_rate": 2e-05, "loss": 0.5622, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15472, "tokens_per_second_per_gpu": 16851.83, "total_tokens": 1527449426 }, { "epoch": 0.9673043260815204, "grad_norm": 0.8984836339950562, "learning_rate": 2e-05, "loss": 0.6049, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15473, "tokens_per_second_per_gpu": 16663.45, "total_tokens": 1527547693 }, { "epoch": 0.9673668417104276, "grad_norm": 0.9352002739906311, "learning_rate": 2e-05, "loss": 0.6127, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15474, "tokens_per_second_per_gpu": 17403.01, "total_tokens": 1527641889 }, { "epoch": 0.9674293573393349, "grad_norm": 0.8779422640800476, "learning_rate": 2e-05, "loss": 0.567, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15475, "tokens_per_second_per_gpu": 16463.07, "total_tokens": 1527738684 }, { "epoch": 0.967491872968242, "grad_norm": 0.8832951188087463, "learning_rate": 2e-05, "loss": 0.6261, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15476, "tokens_per_second_per_gpu": 16698.52, "total_tokens": 1527837278 }, { "epoch": 0.9675543885971493, "grad_norm": 0.9113990068435669, "learning_rate": 2e-05, "loss": 0.6293, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15477, "tokens_per_second_per_gpu": 17385.56, "total_tokens": 1527937376 }, { "epoch": 0.9676169042260565, "grad_norm": 0.8968557715415955, "learning_rate": 2e-05, "loss": 0.613, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15478, "tokens_per_second_per_gpu": 16724.51, "total_tokens": 1528032900 }, { "epoch": 0.9676794198549638, "grad_norm": 0.9518164992332458, "learning_rate": 2e-05, "loss": 0.6319, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15479, "tokens_per_second_per_gpu": 13810.81, "total_tokens": 1528131429 }, { "epoch": 0.967741935483871, "grad_norm": 0.8523252606391907, "learning_rate": 2e-05, "loss": 0.5855, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15480, "tokens_per_second_per_gpu": 15588.19, "total_tokens": 1528230633 }, { "epoch": 0.9678044511127782, "grad_norm": 0.9108231067657471, "learning_rate": 2e-05, "loss": 0.5835, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15481, "tokens_per_second_per_gpu": 17060.43, "total_tokens": 1528331677 }, { "epoch": 0.9678669667416854, "grad_norm": 0.9230881929397583, "learning_rate": 2e-05, "loss": 0.6196, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15482, "tokens_per_second_per_gpu": 17060.24, "total_tokens": 1528429049 }, { "epoch": 0.9679294823705926, "grad_norm": 0.927368700504303, "learning_rate": 2e-05, "loss": 0.6295, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15483, "tokens_per_second_per_gpu": 17376.83, "total_tokens": 1528527336 }, { "epoch": 0.9679919979994999, "grad_norm": 0.8844575881958008, "learning_rate": 2e-05, "loss": 0.6373, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15484, "tokens_per_second_per_gpu": 13716.54, "total_tokens": 1528628464 }, { "epoch": 0.9680545136284071, "grad_norm": 0.8933666944503784, "learning_rate": 2e-05, "loss": 0.6031, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15485, "tokens_per_second_per_gpu": 10048.33, "total_tokens": 1528724812 }, { "epoch": 0.9681170292573144, "grad_norm": 0.9134326577186584, "learning_rate": 2e-05, "loss": 0.5958, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15486, "tokens_per_second_per_gpu": 10128.01, "total_tokens": 1528817018 }, { "epoch": 0.9681795448862216, "grad_norm": 0.9256297945976257, "learning_rate": 2e-05, "loss": 0.607, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15487, "tokens_per_second_per_gpu": 9929.88, "total_tokens": 1528911524 }, { "epoch": 0.9682420605151287, "grad_norm": 0.9217233657836914, "learning_rate": 2e-05, "loss": 0.6066, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15488, "tokens_per_second_per_gpu": 10063.21, "total_tokens": 1529004700 }, { "epoch": 0.968304576144036, "grad_norm": 0.9033342003822327, "learning_rate": 2e-05, "loss": 0.6209, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15489, "tokens_per_second_per_gpu": 10668.85, "total_tokens": 1529101959 }, { "epoch": 0.9683670917729432, "grad_norm": 0.8684918880462646, "learning_rate": 2e-05, "loss": 0.6335, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15490, "tokens_per_second_per_gpu": 10181.37, "total_tokens": 1529202302 }, { "epoch": 0.9684296074018505, "grad_norm": 0.899272084236145, "learning_rate": 2e-05, "loss": 0.6191, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15491, "tokens_per_second_per_gpu": 10771.34, "total_tokens": 1529297622 }, { "epoch": 0.9684921230307577, "grad_norm": 0.8531783223152161, "learning_rate": 2e-05, "loss": 0.6284, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15492, "tokens_per_second_per_gpu": 11231.6, "total_tokens": 1529401842 }, { "epoch": 0.968554638659665, "grad_norm": 0.9077761173248291, "learning_rate": 2e-05, "loss": 0.5951, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15493, "tokens_per_second_per_gpu": 10743.33, "total_tokens": 1529501688 }, { "epoch": 0.9686171542885721, "grad_norm": 0.8765949010848999, "learning_rate": 2e-05, "loss": 0.6088, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15494, "tokens_per_second_per_gpu": 10199.47, "total_tokens": 1529601086 }, { "epoch": 0.9686796699174793, "grad_norm": 0.9364190697669983, "learning_rate": 2e-05, "loss": 0.6212, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15495, "tokens_per_second_per_gpu": 10710.48, "total_tokens": 1529696833 }, { "epoch": 0.9687421855463866, "grad_norm": 0.9636519551277161, "learning_rate": 2e-05, "loss": 0.6102, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15496, "tokens_per_second_per_gpu": 9602.3, "total_tokens": 1529792109 }, { "epoch": 0.9688047011752938, "grad_norm": 0.9072462916374207, "learning_rate": 2e-05, "loss": 0.6245, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15497, "tokens_per_second_per_gpu": 10883.75, "total_tokens": 1529890639 }, { "epoch": 0.9688672168042011, "grad_norm": 0.9205415844917297, "learning_rate": 2e-05, "loss": 0.656, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15498, "tokens_per_second_per_gpu": 10988.8, "total_tokens": 1529989625 }, { "epoch": 0.9689297324331083, "grad_norm": 0.8649718761444092, "learning_rate": 2e-05, "loss": 0.6279, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15499, "tokens_per_second_per_gpu": 10742.58, "total_tokens": 1530091744 }, { "epoch": 0.9689922480620154, "grad_norm": 0.8792685270309448, "learning_rate": 2e-05, "loss": 0.6031, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15500, "tokens_per_second_per_gpu": 10257.99, "total_tokens": 1530187542 }, { "epoch": 0.9690547636909227, "grad_norm": 0.8816508650779724, "learning_rate": 2e-05, "loss": 0.6194, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15501, "tokens_per_second_per_gpu": 11582.37, "total_tokens": 1530288704 }, { "epoch": 0.9691172793198299, "grad_norm": 0.8858681917190552, "learning_rate": 2e-05, "loss": 0.6332, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15502, "tokens_per_second_per_gpu": 10876.26, "total_tokens": 1530390627 }, { "epoch": 0.9691797949487372, "grad_norm": 0.9207919239997864, "learning_rate": 2e-05, "loss": 0.6323, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15503, "tokens_per_second_per_gpu": 10742.98, "total_tokens": 1530491020 }, { "epoch": 0.9692423105776444, "grad_norm": 0.8823500871658325, "learning_rate": 2e-05, "loss": 0.601, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15504, "tokens_per_second_per_gpu": 10338.05, "total_tokens": 1530588968 }, { "epoch": 0.9693048262065517, "grad_norm": 0.9139980673789978, "learning_rate": 2e-05, "loss": 0.5998, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15505, "tokens_per_second_per_gpu": 10348.78, "total_tokens": 1530684992 }, { "epoch": 0.9693673418354589, "grad_norm": 0.9223490357398987, "learning_rate": 2e-05, "loss": 0.6296, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15506, "tokens_per_second_per_gpu": 10085.24, "total_tokens": 1530782410 }, { "epoch": 0.969429857464366, "grad_norm": 0.8767056465148926, "learning_rate": 2e-05, "loss": 0.6012, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15507, "tokens_per_second_per_gpu": 9632.41, "total_tokens": 1530878705 }, { "epoch": 0.9694923730932733, "grad_norm": 0.9254288673400879, "learning_rate": 2e-05, "loss": 0.6177, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15508, "tokens_per_second_per_gpu": 9808.1, "total_tokens": 1530970805 }, { "epoch": 0.9695548887221805, "grad_norm": 0.8781545758247375, "learning_rate": 2e-05, "loss": 0.5977, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15509, "tokens_per_second_per_gpu": 10666.84, "total_tokens": 1531068011 }, { "epoch": 0.9696174043510878, "grad_norm": 0.9113538861274719, "learning_rate": 2e-05, "loss": 0.6762, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15510, "tokens_per_second_per_gpu": 11262.82, "total_tokens": 1531172377 }, { "epoch": 0.969679919979995, "grad_norm": 0.8843540549278259, "learning_rate": 2e-05, "loss": 0.6194, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15511, "tokens_per_second_per_gpu": 10327.85, "total_tokens": 1531269321 }, { "epoch": 0.9697424356089023, "grad_norm": 0.9076164960861206, "learning_rate": 2e-05, "loss": 0.6255, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15512, "tokens_per_second_per_gpu": 11578.2, "total_tokens": 1531371766 }, { "epoch": 0.9698049512378094, "grad_norm": 0.8733648061752319, "learning_rate": 2e-05, "loss": 0.6511, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15513, "tokens_per_second_per_gpu": 11492.59, "total_tokens": 1531475511 }, { "epoch": 0.9698674668667167, "grad_norm": 0.8845567107200623, "learning_rate": 2e-05, "loss": 0.6286, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15514, "tokens_per_second_per_gpu": 10935.1, "total_tokens": 1531578533 }, { "epoch": 0.9699299824956239, "grad_norm": 0.862524688243866, "learning_rate": 2e-05, "loss": 0.5989, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15515, "tokens_per_second_per_gpu": 10137.84, "total_tokens": 1531675001 }, { "epoch": 0.9699924981245311, "grad_norm": 0.8946647047996521, "learning_rate": 2e-05, "loss": 0.5982, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15516, "tokens_per_second_per_gpu": 9729.06, "total_tokens": 1531771500 }, { "epoch": 0.9700550137534384, "grad_norm": 0.9168473482131958, "learning_rate": 2e-05, "loss": 0.648, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15517, "tokens_per_second_per_gpu": 11192.23, "total_tokens": 1531875609 }, { "epoch": 0.9701175293823456, "grad_norm": 0.8853001594543457, "learning_rate": 2e-05, "loss": 0.5932, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15518, "tokens_per_second_per_gpu": 10758.25, "total_tokens": 1531974181 }, { "epoch": 0.9701800450112528, "grad_norm": 0.9286967515945435, "learning_rate": 2e-05, "loss": 0.6193, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15519, "tokens_per_second_per_gpu": 10793.59, "total_tokens": 1532072214 }, { "epoch": 0.97024256064016, "grad_norm": 0.9083291292190552, "learning_rate": 2e-05, "loss": 0.6347, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15520, "tokens_per_second_per_gpu": 10439.98, "total_tokens": 1532169889 }, { "epoch": 0.9703050762690673, "grad_norm": 0.8835698962211609, "learning_rate": 2e-05, "loss": 0.5933, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15521, "tokens_per_second_per_gpu": 10615.12, "total_tokens": 1532268186 }, { "epoch": 0.9703675918979745, "grad_norm": 0.9132490158081055, "learning_rate": 2e-05, "loss": 0.647, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15522, "tokens_per_second_per_gpu": 11214.95, "total_tokens": 1532369821 }, { "epoch": 0.9704301075268817, "grad_norm": 0.8850820660591125, "learning_rate": 2e-05, "loss": 0.5947, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15523, "tokens_per_second_per_gpu": 9502.03, "total_tokens": 1532464471 }, { "epoch": 0.970492623155789, "grad_norm": 0.9006229639053345, "learning_rate": 2e-05, "loss": 0.6543, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15524, "tokens_per_second_per_gpu": 10538.78, "total_tokens": 1532561720 }, { "epoch": 0.9705551387846961, "grad_norm": 0.8875634670257568, "learning_rate": 2e-05, "loss": 0.5877, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15525, "tokens_per_second_per_gpu": 10480.08, "total_tokens": 1532661111 }, { "epoch": 0.9706176544136034, "grad_norm": 0.9123979210853577, "learning_rate": 2e-05, "loss": 0.5793, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15526, "tokens_per_second_per_gpu": 9779.87, "total_tokens": 1532756504 }, { "epoch": 0.9706801700425106, "grad_norm": 0.8588495850563049, "learning_rate": 2e-05, "loss": 0.6309, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15527, "tokens_per_second_per_gpu": 10770.83, "total_tokens": 1532859060 }, { "epoch": 0.9707426856714179, "grad_norm": 0.9561026692390442, "learning_rate": 2e-05, "loss": 0.6042, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15528, "tokens_per_second_per_gpu": 9420.25, "total_tokens": 1532954541 }, { "epoch": 0.9708052013003251, "grad_norm": 0.8564503788948059, "learning_rate": 2e-05, "loss": 0.6322, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15529, "tokens_per_second_per_gpu": 11379.63, "total_tokens": 1533056231 }, { "epoch": 0.9708677169292323, "grad_norm": 0.8780608177185059, "learning_rate": 2e-05, "loss": 0.5937, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15530, "tokens_per_second_per_gpu": 10834.68, "total_tokens": 1533154948 }, { "epoch": 0.9709302325581395, "grad_norm": 0.9130641222000122, "learning_rate": 2e-05, "loss": 0.6425, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15531, "tokens_per_second_per_gpu": 10602.26, "total_tokens": 1533251666 }, { "epoch": 0.9709927481870467, "grad_norm": 0.9186538457870483, "learning_rate": 2e-05, "loss": 0.6044, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15532, "tokens_per_second_per_gpu": 9926.6, "total_tokens": 1533348280 }, { "epoch": 0.971055263815954, "grad_norm": 0.9405791163444519, "learning_rate": 2e-05, "loss": 0.6366, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15533, "tokens_per_second_per_gpu": 10578.42, "total_tokens": 1533448449 }, { "epoch": 0.9711177794448612, "grad_norm": 0.9541135430335999, "learning_rate": 2e-05, "loss": 0.6029, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15534, "tokens_per_second_per_gpu": 9002.38, "total_tokens": 1533539153 }, { "epoch": 0.9711802950737685, "grad_norm": 0.9068172574043274, "learning_rate": 2e-05, "loss": 0.6109, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15535, "tokens_per_second_per_gpu": 10488.68, "total_tokens": 1533637355 }, { "epoch": 0.9712428107026757, "grad_norm": 0.9254809617996216, "learning_rate": 2e-05, "loss": 0.6463, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15536, "tokens_per_second_per_gpu": 9729.34, "total_tokens": 1533732969 }, { "epoch": 0.9713053263315828, "grad_norm": 0.9056990742683411, "learning_rate": 2e-05, "loss": 0.6192, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15537, "tokens_per_second_per_gpu": 10912.06, "total_tokens": 1533831190 }, { "epoch": 0.9713678419604901, "grad_norm": 0.9069780707359314, "learning_rate": 2e-05, "loss": 0.6101, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15538, "tokens_per_second_per_gpu": 10866.5, "total_tokens": 1533929840 }, { "epoch": 0.9714303575893973, "grad_norm": 0.9026123285293579, "learning_rate": 2e-05, "loss": 0.6028, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15539, "tokens_per_second_per_gpu": 10272.77, "total_tokens": 1534029272 }, { "epoch": 0.9714928732183046, "grad_norm": 0.8757830262184143, "learning_rate": 2e-05, "loss": 0.6191, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15540, "tokens_per_second_per_gpu": 10682.85, "total_tokens": 1534127909 }, { "epoch": 0.9715553888472118, "grad_norm": 0.9001328945159912, "learning_rate": 2e-05, "loss": 0.6079, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15541, "tokens_per_second_per_gpu": 10483.52, "total_tokens": 1534220000 }, { "epoch": 0.9716179044761191, "grad_norm": 0.8640457987785339, "learning_rate": 2e-05, "loss": 0.6135, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15542, "tokens_per_second_per_gpu": 10575.25, "total_tokens": 1534321033 }, { "epoch": 0.9716804201050263, "grad_norm": 0.8841169476509094, "learning_rate": 2e-05, "loss": 0.5856, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15543, "tokens_per_second_per_gpu": 10851.42, "total_tokens": 1534416875 }, { "epoch": 0.9717429357339334, "grad_norm": 0.8857172131538391, "learning_rate": 2e-05, "loss": 0.6068, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15544, "tokens_per_second_per_gpu": 10596.14, "total_tokens": 1534514210 }, { "epoch": 0.9718054513628407, "grad_norm": 0.9022918343544006, "learning_rate": 2e-05, "loss": 0.6346, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15545, "tokens_per_second_per_gpu": 10687.49, "total_tokens": 1534613182 }, { "epoch": 0.9718679669917479, "grad_norm": 0.9730547666549683, "learning_rate": 2e-05, "loss": 0.6006, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15546, "tokens_per_second_per_gpu": 10530.55, "total_tokens": 1534707790 }, { "epoch": 0.9719304826206552, "grad_norm": 0.8914802670478821, "learning_rate": 2e-05, "loss": 0.6341, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15547, "tokens_per_second_per_gpu": 10881.57, "total_tokens": 1534806655 }, { "epoch": 0.9719929982495624, "grad_norm": 0.9295873045921326, "learning_rate": 2e-05, "loss": 0.6179, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15548, "tokens_per_second_per_gpu": 11218.9, "total_tokens": 1534901331 }, { "epoch": 0.9720555138784697, "grad_norm": 0.9076344966888428, "learning_rate": 2e-05, "loss": 0.6213, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15549, "tokens_per_second_per_gpu": 10523.21, "total_tokens": 1535001713 }, { "epoch": 0.9721180295073768, "grad_norm": 0.8819646239280701, "learning_rate": 2e-05, "loss": 0.6359, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15550, "tokens_per_second_per_gpu": 11189.53, "total_tokens": 1535099410 }, { "epoch": 0.972180545136284, "grad_norm": 0.9049898386001587, "learning_rate": 2e-05, "loss": 0.598, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15551, "tokens_per_second_per_gpu": 10153.2, "total_tokens": 1535196001 }, { "epoch": 0.9722430607651913, "grad_norm": 0.8827574253082275, "learning_rate": 2e-05, "loss": 0.6111, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15552, "tokens_per_second_per_gpu": 10285.88, "total_tokens": 1535293356 }, { "epoch": 0.9723055763940985, "grad_norm": 0.9216961860656738, "learning_rate": 2e-05, "loss": 0.6288, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15553, "tokens_per_second_per_gpu": 9994.36, "total_tokens": 1535389250 }, { "epoch": 0.9723680920230058, "grad_norm": 0.9048560261726379, "learning_rate": 2e-05, "loss": 0.629, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15554, "tokens_per_second_per_gpu": 9963.08, "total_tokens": 1535486196 }, { "epoch": 0.972430607651913, "grad_norm": 0.8841418027877808, "learning_rate": 2e-05, "loss": 0.6303, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15555, "tokens_per_second_per_gpu": 10933.45, "total_tokens": 1535586833 }, { "epoch": 0.9724931232808202, "grad_norm": 0.9385847449302673, "learning_rate": 2e-05, "loss": 0.6152, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15556, "tokens_per_second_per_gpu": 10515.78, "total_tokens": 1535683388 }, { "epoch": 0.9725556389097274, "grad_norm": 0.9074570536613464, "learning_rate": 2e-05, "loss": 0.618, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15557, "tokens_per_second_per_gpu": 11068.85, "total_tokens": 1535782066 }, { "epoch": 0.9726181545386346, "grad_norm": 0.9038183093070984, "learning_rate": 2e-05, "loss": 0.6065, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15558, "tokens_per_second_per_gpu": 10406.55, "total_tokens": 1535878863 }, { "epoch": 0.9726806701675419, "grad_norm": 0.8950600624084473, "learning_rate": 2e-05, "loss": 0.6387, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15559, "tokens_per_second_per_gpu": 10483.49, "total_tokens": 1535981250 }, { "epoch": 0.9727431857964491, "grad_norm": 0.8701111078262329, "learning_rate": 2e-05, "loss": 0.5918, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15560, "tokens_per_second_per_gpu": 10304.96, "total_tokens": 1536078177 }, { "epoch": 0.9728057014253564, "grad_norm": 0.9040225148200989, "learning_rate": 2e-05, "loss": 0.6, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15561, "tokens_per_second_per_gpu": 10625.88, "total_tokens": 1536175521 }, { "epoch": 0.9728682170542635, "grad_norm": 0.9191792607307434, "learning_rate": 2e-05, "loss": 0.6285, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15562, "tokens_per_second_per_gpu": 10571.97, "total_tokens": 1536270783 }, { "epoch": 0.9729307326831708, "grad_norm": 0.8842824101448059, "learning_rate": 2e-05, "loss": 0.5983, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15563, "tokens_per_second_per_gpu": 10326.84, "total_tokens": 1536366158 }, { "epoch": 0.972993248312078, "grad_norm": 0.8733096122741699, "learning_rate": 2e-05, "loss": 0.5624, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15564, "tokens_per_second_per_gpu": 10119.1, "total_tokens": 1536462679 }, { "epoch": 0.9730557639409853, "grad_norm": 0.950420081615448, "learning_rate": 2e-05, "loss": 0.6488, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15565, "tokens_per_second_per_gpu": 9829.24, "total_tokens": 1536555753 }, { "epoch": 0.9731182795698925, "grad_norm": 0.896772563457489, "learning_rate": 2e-05, "loss": 0.6245, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15566, "tokens_per_second_per_gpu": 10496.19, "total_tokens": 1536654854 }, { "epoch": 0.9731807951987997, "grad_norm": 0.8866901397705078, "learning_rate": 2e-05, "loss": 0.6155, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15567, "tokens_per_second_per_gpu": 10665.91, "total_tokens": 1536748621 }, { "epoch": 0.9732433108277069, "grad_norm": 0.9095102548599243, "learning_rate": 2e-05, "loss": 0.5876, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15568, "tokens_per_second_per_gpu": 9644.68, "total_tokens": 1536842661 }, { "epoch": 0.9733058264566141, "grad_norm": 0.9121004939079285, "learning_rate": 2e-05, "loss": 0.5784, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15569, "tokens_per_second_per_gpu": 9936.49, "total_tokens": 1536932438 }, { "epoch": 0.9733683420855214, "grad_norm": 0.9278334975242615, "learning_rate": 2e-05, "loss": 0.64, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15570, "tokens_per_second_per_gpu": 10415.05, "total_tokens": 1537028458 }, { "epoch": 0.9734308577144286, "grad_norm": 0.905604898929596, "learning_rate": 2e-05, "loss": 0.6177, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15571, "tokens_per_second_per_gpu": 10453.9, "total_tokens": 1537125982 }, { "epoch": 0.9734933733433359, "grad_norm": 0.8947100639343262, "learning_rate": 2e-05, "loss": 0.5783, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15572, "tokens_per_second_per_gpu": 9839.71, "total_tokens": 1537220057 }, { "epoch": 0.9735558889722431, "grad_norm": 0.9246360063552856, "learning_rate": 2e-05, "loss": 0.6076, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15573, "tokens_per_second_per_gpu": 9695.29, "total_tokens": 1537314353 }, { "epoch": 0.9736184046011502, "grad_norm": 0.9770556688308716, "learning_rate": 2e-05, "loss": 0.6494, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15574, "tokens_per_second_per_gpu": 10664.93, "total_tokens": 1537416662 }, { "epoch": 0.9736809202300575, "grad_norm": 0.8851789832115173, "learning_rate": 2e-05, "loss": 0.6389, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15575, "tokens_per_second_per_gpu": 11482.37, "total_tokens": 1537515181 }, { "epoch": 0.9737434358589647, "grad_norm": 0.9401749968528748, "learning_rate": 2e-05, "loss": 0.6757, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15576, "tokens_per_second_per_gpu": 10840.14, "total_tokens": 1537613416 }, { "epoch": 0.973805951487872, "grad_norm": 0.9479246139526367, "learning_rate": 2e-05, "loss": 0.5551, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15577, "tokens_per_second_per_gpu": 11198.4, "total_tokens": 1537711966 }, { "epoch": 0.9738684671167792, "grad_norm": 0.9138476848602295, "learning_rate": 2e-05, "loss": 0.6006, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15578, "tokens_per_second_per_gpu": 10177.85, "total_tokens": 1537810070 }, { "epoch": 0.9739309827456865, "grad_norm": 0.881169855594635, "learning_rate": 2e-05, "loss": 0.6271, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15579, "tokens_per_second_per_gpu": 10478.4, "total_tokens": 1537912065 }, { "epoch": 0.9739934983745937, "grad_norm": 0.9042054414749146, "learning_rate": 2e-05, "loss": 0.6701, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15580, "tokens_per_second_per_gpu": 11080.73, "total_tokens": 1538015588 }, { "epoch": 0.9740560140035008, "grad_norm": 0.9113815426826477, "learning_rate": 2e-05, "loss": 0.615, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15581, "tokens_per_second_per_gpu": 11285.55, "total_tokens": 1538118137 }, { "epoch": 0.9741185296324081, "grad_norm": 0.9025397300720215, "learning_rate": 2e-05, "loss": 0.5815, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15582, "tokens_per_second_per_gpu": 11162.92, "total_tokens": 1538215323 }, { "epoch": 0.9741810452613153, "grad_norm": 0.9316595792770386, "learning_rate": 2e-05, "loss": 0.6147, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15583, "tokens_per_second_per_gpu": 10132.81, "total_tokens": 1538312672 }, { "epoch": 0.9742435608902226, "grad_norm": 0.9128536581993103, "learning_rate": 2e-05, "loss": 0.6227, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15584, "tokens_per_second_per_gpu": 10605.29, "total_tokens": 1538412297 }, { "epoch": 0.9743060765191298, "grad_norm": 0.8885732293128967, "learning_rate": 2e-05, "loss": 0.6271, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15585, "tokens_per_second_per_gpu": 10278.73, "total_tokens": 1538509598 }, { "epoch": 0.9743685921480371, "grad_norm": 0.8614484667778015, "learning_rate": 2e-05, "loss": 0.5987, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15586, "tokens_per_second_per_gpu": 10751.48, "total_tokens": 1538609874 }, { "epoch": 0.9744311077769442, "grad_norm": 0.859469473361969, "learning_rate": 2e-05, "loss": 0.6118, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15587, "tokens_per_second_per_gpu": 10576.89, "total_tokens": 1538709225 }, { "epoch": 0.9744936234058514, "grad_norm": 0.8849117159843445, "learning_rate": 2e-05, "loss": 0.5544, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15588, "tokens_per_second_per_gpu": 10043.11, "total_tokens": 1538802688 }, { "epoch": 0.9745561390347587, "grad_norm": 0.9029787182807922, "learning_rate": 2e-05, "loss": 0.6332, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15589, "tokens_per_second_per_gpu": 10423.68, "total_tokens": 1538903224 }, { "epoch": 0.9746186546636659, "grad_norm": 0.8839531540870667, "learning_rate": 2e-05, "loss": 0.6233, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15590, "tokens_per_second_per_gpu": 10772.97, "total_tokens": 1539006577 }, { "epoch": 0.9746811702925732, "grad_norm": 0.8874107599258423, "learning_rate": 2e-05, "loss": 0.5637, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15591, "tokens_per_second_per_gpu": 9770.87, "total_tokens": 1539098467 }, { "epoch": 0.9747436859214804, "grad_norm": 0.9038136601448059, "learning_rate": 2e-05, "loss": 0.6196, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15592, "tokens_per_second_per_gpu": 10085.31, "total_tokens": 1539192808 }, { "epoch": 0.9748062015503876, "grad_norm": 0.8685385584831238, "learning_rate": 2e-05, "loss": 0.6248, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15593, "tokens_per_second_per_gpu": 11384.23, "total_tokens": 1539295112 }, { "epoch": 0.9748687171792948, "grad_norm": 0.8656389117240906, "learning_rate": 2e-05, "loss": 0.6261, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15594, "tokens_per_second_per_gpu": 11226.01, "total_tokens": 1539399016 }, { "epoch": 0.974931232808202, "grad_norm": 0.904269278049469, "learning_rate": 2e-05, "loss": 0.5946, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15595, "tokens_per_second_per_gpu": 10495.86, "total_tokens": 1539496985 }, { "epoch": 0.9749937484371093, "grad_norm": 0.928803563117981, "learning_rate": 2e-05, "loss": 0.5765, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15596, "tokens_per_second_per_gpu": 11026.97, "total_tokens": 1539592483 }, { "epoch": 0.9750562640660165, "grad_norm": 0.8642646670341492, "learning_rate": 2e-05, "loss": 0.5938, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15597, "tokens_per_second_per_gpu": 10779.92, "total_tokens": 1539691067 }, { "epoch": 0.9751187796949238, "grad_norm": 1.0367164611816406, "learning_rate": 2e-05, "loss": 0.5652, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15598, "tokens_per_second_per_gpu": 10158.71, "total_tokens": 1539783066 }, { "epoch": 0.9751812953238309, "grad_norm": 0.8856890201568604, "learning_rate": 2e-05, "loss": 0.6047, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15599, "tokens_per_second_per_gpu": 10139.48, "total_tokens": 1539884015 }, { "epoch": 0.9752438109527382, "grad_norm": 0.8815948963165283, "learning_rate": 2e-05, "loss": 0.6451, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15600, "tokens_per_second_per_gpu": 10999.64, "total_tokens": 1539986983 }, { "epoch": 0.9753063265816454, "grad_norm": 0.8622372150421143, "learning_rate": 2e-05, "loss": 0.6162, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15601, "tokens_per_second_per_gpu": 10737.15, "total_tokens": 1540084453 }, { "epoch": 0.9753688422105526, "grad_norm": 0.9567110538482666, "learning_rate": 2e-05, "loss": 0.5958, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15602, "tokens_per_second_per_gpu": 9500.53, "total_tokens": 1540173138 }, { "epoch": 0.9754313578394599, "grad_norm": 0.8965917229652405, "learning_rate": 2e-05, "loss": 0.6001, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15603, "tokens_per_second_per_gpu": 10609.32, "total_tokens": 1540275767 }, { "epoch": 0.9754938734683671, "grad_norm": 0.8726935386657715, "learning_rate": 2e-05, "loss": 0.5725, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15604, "tokens_per_second_per_gpu": 10354.22, "total_tokens": 1540376934 }, { "epoch": 0.9755563890972743, "grad_norm": 0.8610224723815918, "learning_rate": 2e-05, "loss": 0.649, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15605, "tokens_per_second_per_gpu": 10917.63, "total_tokens": 1540479514 }, { "epoch": 0.9756189047261815, "grad_norm": 0.9021724462509155, "learning_rate": 2e-05, "loss": 0.603, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15606, "tokens_per_second_per_gpu": 10808.37, "total_tokens": 1540575394 }, { "epoch": 0.9756814203550888, "grad_norm": 0.8714771270751953, "learning_rate": 2e-05, "loss": 0.6073, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15607, "tokens_per_second_per_gpu": 10908.9, "total_tokens": 1540673448 }, { "epoch": 0.975743935983996, "grad_norm": 0.8311195373535156, "learning_rate": 2e-05, "loss": 0.5843, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15608, "tokens_per_second_per_gpu": 11210.46, "total_tokens": 1540776541 }, { "epoch": 0.9758064516129032, "grad_norm": 0.8614290356636047, "learning_rate": 2e-05, "loss": 0.5947, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15609, "tokens_per_second_per_gpu": 10686.51, "total_tokens": 1540873176 }, { "epoch": 0.9758689672418105, "grad_norm": 0.8828797340393066, "learning_rate": 2e-05, "loss": 0.6128, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15610, "tokens_per_second_per_gpu": 11026.99, "total_tokens": 1540976929 }, { "epoch": 0.9759314828707176, "grad_norm": 0.8823716640472412, "learning_rate": 2e-05, "loss": 0.62, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15611, "tokens_per_second_per_gpu": 10833.07, "total_tokens": 1541077200 }, { "epoch": 0.9759939984996249, "grad_norm": 0.910293459892273, "learning_rate": 2e-05, "loss": 0.6167, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15612, "tokens_per_second_per_gpu": 10543.83, "total_tokens": 1541177978 }, { "epoch": 0.9760565141285321, "grad_norm": 0.8818903565406799, "learning_rate": 2e-05, "loss": 0.5967, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15613, "tokens_per_second_per_gpu": 10414.4, "total_tokens": 1541274769 }, { "epoch": 0.9761190297574394, "grad_norm": 0.8996086716651917, "learning_rate": 2e-05, "loss": 0.6077, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15614, "tokens_per_second_per_gpu": 10258.41, "total_tokens": 1541372370 }, { "epoch": 0.9761815453863466, "grad_norm": 0.8867799639701843, "learning_rate": 2e-05, "loss": 0.6583, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15615, "tokens_per_second_per_gpu": 10808.88, "total_tokens": 1541473607 }, { "epoch": 0.9762440610152539, "grad_norm": 0.8648610711097717, "learning_rate": 2e-05, "loss": 0.5912, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15616, "tokens_per_second_per_gpu": 10503.36, "total_tokens": 1541571686 }, { "epoch": 0.9763065766441611, "grad_norm": 0.9465040564537048, "learning_rate": 2e-05, "loss": 0.6375, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15617, "tokens_per_second_per_gpu": 10082.25, "total_tokens": 1541666290 }, { "epoch": 0.9763690922730682, "grad_norm": 0.8940884470939636, "learning_rate": 2e-05, "loss": 0.5894, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15618, "tokens_per_second_per_gpu": 10784.85, "total_tokens": 1541762849 }, { "epoch": 0.9764316079019755, "grad_norm": 0.8599560856819153, "learning_rate": 2e-05, "loss": 0.5679, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15619, "tokens_per_second_per_gpu": 9694.48, "total_tokens": 1541856347 }, { "epoch": 0.9764941235308827, "grad_norm": 0.8628836274147034, "learning_rate": 2e-05, "loss": 0.6096, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15620, "tokens_per_second_per_gpu": 10657.71, "total_tokens": 1541958587 }, { "epoch": 0.97655663915979, "grad_norm": 0.8560936450958252, "learning_rate": 2e-05, "loss": 0.6171, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15621, "tokens_per_second_per_gpu": 11589.29, "total_tokens": 1542064158 }, { "epoch": 0.9766191547886972, "grad_norm": 0.8884732723236084, "learning_rate": 2e-05, "loss": 0.5919, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15622, "tokens_per_second_per_gpu": 10184.12, "total_tokens": 1542162817 }, { "epoch": 0.9766816704176045, "grad_norm": 0.8700750470161438, "learning_rate": 2e-05, "loss": 0.6576, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15623, "tokens_per_second_per_gpu": 10688.1, "total_tokens": 1542264737 }, { "epoch": 0.9767441860465116, "grad_norm": 0.9121853113174438, "learning_rate": 2e-05, "loss": 0.624, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15624, "tokens_per_second_per_gpu": 11334.89, "total_tokens": 1542369221 }, { "epoch": 0.9768067016754188, "grad_norm": 0.8739206194877625, "learning_rate": 2e-05, "loss": 0.5876, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15625, "tokens_per_second_per_gpu": 10170.03, "total_tokens": 1542465182 }, { "epoch": 0.9768692173043261, "grad_norm": 0.8723366856575012, "learning_rate": 2e-05, "loss": 0.6179, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15626, "tokens_per_second_per_gpu": 10887.85, "total_tokens": 1542566023 }, { "epoch": 0.9769317329332333, "grad_norm": 0.8769696354866028, "learning_rate": 2e-05, "loss": 0.5698, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15627, "tokens_per_second_per_gpu": 9920.36, "total_tokens": 1542661388 }, { "epoch": 0.9769942485621406, "grad_norm": 0.921683132648468, "learning_rate": 2e-05, "loss": 0.6632, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15628, "tokens_per_second_per_gpu": 10410.72, "total_tokens": 1542765268 }, { "epoch": 0.9770567641910478, "grad_norm": 0.8725969195365906, "learning_rate": 2e-05, "loss": 0.6418, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15629, "tokens_per_second_per_gpu": 10777.42, "total_tokens": 1542868633 }, { "epoch": 0.977119279819955, "grad_norm": 0.9570024013519287, "learning_rate": 2e-05, "loss": 0.6351, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15630, "tokens_per_second_per_gpu": 10528.69, "total_tokens": 1542967962 }, { "epoch": 0.9771817954488622, "grad_norm": 0.8999803066253662, "learning_rate": 2e-05, "loss": 0.6684, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15631, "tokens_per_second_per_gpu": 10747.98, "total_tokens": 1543066687 }, { "epoch": 0.9772443110777694, "grad_norm": 0.868233859539032, "learning_rate": 2e-05, "loss": 0.6047, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15632, "tokens_per_second_per_gpu": 10680.19, "total_tokens": 1543165685 }, { "epoch": 0.9773068267066767, "grad_norm": 0.8861594796180725, "learning_rate": 2e-05, "loss": 0.5985, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15633, "tokens_per_second_per_gpu": 10182.21, "total_tokens": 1543262844 }, { "epoch": 0.9773693423355839, "grad_norm": 0.8708881139755249, "learning_rate": 2e-05, "loss": 0.6127, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15634, "tokens_per_second_per_gpu": 10868.4, "total_tokens": 1543363246 }, { "epoch": 0.9774318579644912, "grad_norm": 0.9001837968826294, "learning_rate": 2e-05, "loss": 0.5856, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15635, "tokens_per_second_per_gpu": 9873.79, "total_tokens": 1543457182 }, { "epoch": 0.9774943735933983, "grad_norm": 0.852364182472229, "learning_rate": 2e-05, "loss": 0.5857, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15636, "tokens_per_second_per_gpu": 10934.71, "total_tokens": 1543559429 }, { "epoch": 0.9775568892223055, "grad_norm": 0.8588711023330688, "learning_rate": 2e-05, "loss": 0.5812, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15637, "tokens_per_second_per_gpu": 10902.46, "total_tokens": 1543659087 }, { "epoch": 0.9776194048512128, "grad_norm": 0.8967283368110657, "learning_rate": 2e-05, "loss": 0.6018, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15638, "tokens_per_second_per_gpu": 10718.75, "total_tokens": 1543758611 }, { "epoch": 0.97768192048012, "grad_norm": 0.903782844543457, "learning_rate": 2e-05, "loss": 0.5945, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15639, "tokens_per_second_per_gpu": 10964.78, "total_tokens": 1543859470 }, { "epoch": 0.9777444361090273, "grad_norm": 0.9064813852310181, "learning_rate": 2e-05, "loss": 0.572, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15640, "tokens_per_second_per_gpu": 9252.61, "total_tokens": 1543952034 }, { "epoch": 0.9778069517379345, "grad_norm": 0.8807922601699829, "learning_rate": 2e-05, "loss": 0.6166, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15641, "tokens_per_second_per_gpu": 10869.64, "total_tokens": 1544054061 }, { "epoch": 0.9778694673668417, "grad_norm": 0.8918820023536682, "learning_rate": 2e-05, "loss": 0.6083, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15642, "tokens_per_second_per_gpu": 10301.76, "total_tokens": 1544153268 }, { "epoch": 0.9779319829957489, "grad_norm": 0.9055427312850952, "learning_rate": 2e-05, "loss": 0.5942, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15643, "tokens_per_second_per_gpu": 10181.11, "total_tokens": 1544244503 }, { "epoch": 0.9779944986246562, "grad_norm": 0.8906503915786743, "learning_rate": 2e-05, "loss": 0.5713, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15644, "tokens_per_second_per_gpu": 10844.43, "total_tokens": 1544339985 }, { "epoch": 0.9780570142535634, "grad_norm": 0.8997369408607483, "learning_rate": 2e-05, "loss": 0.5661, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15645, "tokens_per_second_per_gpu": 10752.99, "total_tokens": 1544436670 }, { "epoch": 0.9781195298824706, "grad_norm": 0.9726508855819702, "learning_rate": 2e-05, "loss": 0.6024, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15646, "tokens_per_second_per_gpu": 9204.27, "total_tokens": 1544525752 }, { "epoch": 0.9781820455113779, "grad_norm": 0.8802331686019897, "learning_rate": 2e-05, "loss": 0.6265, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15647, "tokens_per_second_per_gpu": 10689.95, "total_tokens": 1544621440 }, { "epoch": 0.978244561140285, "grad_norm": 0.9170297384262085, "learning_rate": 2e-05, "loss": 0.6034, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15648, "tokens_per_second_per_gpu": 10676.02, "total_tokens": 1544720386 }, { "epoch": 0.9783070767691923, "grad_norm": 0.851495087146759, "learning_rate": 2e-05, "loss": 0.5496, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15649, "tokens_per_second_per_gpu": 10049.66, "total_tokens": 1544813593 }, { "epoch": 0.9783695923980995, "grad_norm": 0.8976171612739563, "learning_rate": 2e-05, "loss": 0.5996, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15650, "tokens_per_second_per_gpu": 9724.66, "total_tokens": 1544909531 }, { "epoch": 0.9784321080270068, "grad_norm": 0.8947418928146362, "learning_rate": 2e-05, "loss": 0.6244, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15651, "tokens_per_second_per_gpu": 11091.72, "total_tokens": 1545011712 }, { "epoch": 0.978494623655914, "grad_norm": 0.8729733228683472, "learning_rate": 2e-05, "loss": 0.633, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15652, "tokens_per_second_per_gpu": 10617.97, "total_tokens": 1545111277 }, { "epoch": 0.9785571392848212, "grad_norm": 0.8856154680252075, "learning_rate": 2e-05, "loss": 0.621, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15653, "tokens_per_second_per_gpu": 10175.35, "total_tokens": 1545208195 }, { "epoch": 0.9786196549137285, "grad_norm": 0.9246373772621155, "learning_rate": 2e-05, "loss": 0.5784, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15654, "tokens_per_second_per_gpu": 9764.69, "total_tokens": 1545301312 }, { "epoch": 0.9786821705426356, "grad_norm": 0.9208111763000488, "learning_rate": 2e-05, "loss": 0.6149, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15655, "tokens_per_second_per_gpu": 10487.57, "total_tokens": 1545397559 }, { "epoch": 0.9787446861715429, "grad_norm": 0.8991165161132812, "learning_rate": 2e-05, "loss": 0.5813, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15656, "tokens_per_second_per_gpu": 11050.02, "total_tokens": 1545494999 }, { "epoch": 0.9788072018004501, "grad_norm": 0.8732203841209412, "learning_rate": 2e-05, "loss": 0.5631, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15657, "tokens_per_second_per_gpu": 9925.23, "total_tokens": 1545592086 }, { "epoch": 0.9788697174293574, "grad_norm": 0.9074670076370239, "learning_rate": 2e-05, "loss": 0.6079, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15658, "tokens_per_second_per_gpu": 10052.32, "total_tokens": 1545686265 }, { "epoch": 0.9789322330582646, "grad_norm": 0.8882374167442322, "learning_rate": 2e-05, "loss": 0.5877, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15659, "tokens_per_second_per_gpu": 10346.01, "total_tokens": 1545786147 }, { "epoch": 0.9789947486871718, "grad_norm": 0.89541095495224, "learning_rate": 2e-05, "loss": 0.6028, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15660, "tokens_per_second_per_gpu": 10995.34, "total_tokens": 1545887710 }, { "epoch": 0.979057264316079, "grad_norm": 0.8560125827789307, "learning_rate": 2e-05, "loss": 0.6189, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15661, "tokens_per_second_per_gpu": 10954.94, "total_tokens": 1545992479 }, { "epoch": 0.9791197799449862, "grad_norm": 0.9055610299110413, "learning_rate": 2e-05, "loss": 0.5878, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15662, "tokens_per_second_per_gpu": 10514.3, "total_tokens": 1546091486 }, { "epoch": 0.9791822955738935, "grad_norm": 0.9132282733917236, "learning_rate": 2e-05, "loss": 0.608, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15663, "tokens_per_second_per_gpu": 10861.7, "total_tokens": 1546189655 }, { "epoch": 0.9792448112028007, "grad_norm": 0.9140270948410034, "learning_rate": 2e-05, "loss": 0.5899, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15664, "tokens_per_second_per_gpu": 10573.14, "total_tokens": 1546286096 }, { "epoch": 0.979307326831708, "grad_norm": 0.8812698721885681, "learning_rate": 2e-05, "loss": 0.6138, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15665, "tokens_per_second_per_gpu": 10422.71, "total_tokens": 1546387939 }, { "epoch": 0.9793698424606152, "grad_norm": 0.9200754165649414, "learning_rate": 2e-05, "loss": 0.6163, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15666, "tokens_per_second_per_gpu": 10917.55, "total_tokens": 1546487928 }, { "epoch": 0.9794323580895223, "grad_norm": 0.9312268495559692, "learning_rate": 2e-05, "loss": 0.6094, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15667, "tokens_per_second_per_gpu": 10527.0, "total_tokens": 1546584610 }, { "epoch": 0.9794948737184296, "grad_norm": 0.8655555248260498, "learning_rate": 2e-05, "loss": 0.6183, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15668, "tokens_per_second_per_gpu": 10786.3, "total_tokens": 1546685020 }, { "epoch": 0.9795573893473368, "grad_norm": 0.8698891997337341, "learning_rate": 2e-05, "loss": 0.616, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15669, "tokens_per_second_per_gpu": 11545.18, "total_tokens": 1546786087 }, { "epoch": 0.9796199049762441, "grad_norm": 0.9600833058357239, "learning_rate": 2e-05, "loss": 0.6398, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15670, "tokens_per_second_per_gpu": 10607.0, "total_tokens": 1546885635 }, { "epoch": 0.9796824206051513, "grad_norm": 0.8815635442733765, "learning_rate": 2e-05, "loss": 0.5835, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15671, "tokens_per_second_per_gpu": 10013.15, "total_tokens": 1546981355 }, { "epoch": 0.9797449362340586, "grad_norm": 0.8520796298980713, "learning_rate": 2e-05, "loss": 0.5735, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15672, "tokens_per_second_per_gpu": 10729.57, "total_tokens": 1547082863 }, { "epoch": 0.9798074518629657, "grad_norm": 0.9026328325271606, "learning_rate": 2e-05, "loss": 0.6119, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15673, "tokens_per_second_per_gpu": 10149.85, "total_tokens": 1547181289 }, { "epoch": 0.9798699674918729, "grad_norm": 0.8739302158355713, "learning_rate": 2e-05, "loss": 0.6151, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15674, "tokens_per_second_per_gpu": 10797.34, "total_tokens": 1547282565 }, { "epoch": 0.9799324831207802, "grad_norm": 0.908128023147583, "learning_rate": 2e-05, "loss": 0.6407, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15675, "tokens_per_second_per_gpu": 11319.65, "total_tokens": 1547386177 }, { "epoch": 0.9799949987496874, "grad_norm": 0.919609546661377, "learning_rate": 2e-05, "loss": 0.6284, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15676, "tokens_per_second_per_gpu": 10099.6, "total_tokens": 1547482571 }, { "epoch": 0.9800575143785947, "grad_norm": 0.8719038963317871, "learning_rate": 2e-05, "loss": 0.6257, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15677, "tokens_per_second_per_gpu": 10707.29, "total_tokens": 1547582097 }, { "epoch": 0.9801200300075019, "grad_norm": 0.9216874241828918, "learning_rate": 2e-05, "loss": 0.6337, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15678, "tokens_per_second_per_gpu": 10929.48, "total_tokens": 1547677356 }, { "epoch": 0.980182545636409, "grad_norm": 0.9069056510925293, "learning_rate": 2e-05, "loss": 0.6417, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15679, "tokens_per_second_per_gpu": 10574.37, "total_tokens": 1547777193 }, { "epoch": 0.9802450612653163, "grad_norm": 0.8998289704322815, "learning_rate": 2e-05, "loss": 0.6183, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15680, "tokens_per_second_per_gpu": 10041.01, "total_tokens": 1547873506 }, { "epoch": 0.9803075768942235, "grad_norm": 0.8772315979003906, "learning_rate": 2e-05, "loss": 0.6175, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15681, "tokens_per_second_per_gpu": 10044.67, "total_tokens": 1547974861 }, { "epoch": 0.9803700925231308, "grad_norm": 0.848381757736206, "learning_rate": 2e-05, "loss": 0.6055, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15682, "tokens_per_second_per_gpu": 10785.88, "total_tokens": 1548075291 }, { "epoch": 0.980432608152038, "grad_norm": 0.8969103097915649, "learning_rate": 2e-05, "loss": 0.5928, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15683, "tokens_per_second_per_gpu": 10423.41, "total_tokens": 1548172841 }, { "epoch": 0.9804951237809453, "grad_norm": 0.925014317035675, "learning_rate": 2e-05, "loss": 0.6504, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15684, "tokens_per_second_per_gpu": 9971.46, "total_tokens": 1548270623 }, { "epoch": 0.9805576394098524, "grad_norm": 0.8793136477470398, "learning_rate": 2e-05, "loss": 0.6356, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15685, "tokens_per_second_per_gpu": 11270.68, "total_tokens": 1548375475 }, { "epoch": 0.9806201550387597, "grad_norm": 0.9309556484222412, "learning_rate": 2e-05, "loss": 0.6044, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15686, "tokens_per_second_per_gpu": 10886.3, "total_tokens": 1548469710 }, { "epoch": 0.9806826706676669, "grad_norm": 0.9550617337226868, "learning_rate": 2e-05, "loss": 0.5913, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15687, "tokens_per_second_per_gpu": 9738.68, "total_tokens": 1548567021 }, { "epoch": 0.9807451862965741, "grad_norm": 0.8894713521003723, "learning_rate": 2e-05, "loss": 0.6076, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15688, "tokens_per_second_per_gpu": 9804.77, "total_tokens": 1548662718 }, { "epoch": 0.9808077019254814, "grad_norm": 0.928002655506134, "learning_rate": 2e-05, "loss": 0.5812, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15689, "tokens_per_second_per_gpu": 9656.35, "total_tokens": 1548757171 }, { "epoch": 0.9808702175543886, "grad_norm": 0.8891594409942627, "learning_rate": 2e-05, "loss": 0.6412, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15690, "tokens_per_second_per_gpu": 11442.41, "total_tokens": 1548854441 }, { "epoch": 0.9809327331832958, "grad_norm": 0.8880210518836975, "learning_rate": 2e-05, "loss": 0.6014, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15691, "tokens_per_second_per_gpu": 10732.98, "total_tokens": 1548950903 }, { "epoch": 0.980995248812203, "grad_norm": 0.8817471265792847, "learning_rate": 2e-05, "loss": 0.5685, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15692, "tokens_per_second_per_gpu": 10760.75, "total_tokens": 1549044864 }, { "epoch": 0.9810577644411103, "grad_norm": 0.8857513666152954, "learning_rate": 2e-05, "loss": 0.5704, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15693, "tokens_per_second_per_gpu": 9604.92, "total_tokens": 1549137083 }, { "epoch": 0.9811202800700175, "grad_norm": 0.8719456791877747, "learning_rate": 2e-05, "loss": 0.6025, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15694, "tokens_per_second_per_gpu": 10878.76, "total_tokens": 1549239581 }, { "epoch": 0.9811827956989247, "grad_norm": 0.8442271947860718, "learning_rate": 2e-05, "loss": 0.588, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15695, "tokens_per_second_per_gpu": 10357.49, "total_tokens": 1549338549 }, { "epoch": 0.981245311327832, "grad_norm": 0.8761268854141235, "learning_rate": 2e-05, "loss": 0.5974, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15696, "tokens_per_second_per_gpu": 10194.69, "total_tokens": 1549438256 }, { "epoch": 0.9813078269567392, "grad_norm": 0.884234607219696, "learning_rate": 2e-05, "loss": 0.5707, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15697, "tokens_per_second_per_gpu": 10629.6, "total_tokens": 1549536284 }, { "epoch": 0.9813703425856464, "grad_norm": 0.9236898422241211, "learning_rate": 2e-05, "loss": 0.6188, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15698, "tokens_per_second_per_gpu": 10510.67, "total_tokens": 1549636554 }, { "epoch": 0.9814328582145536, "grad_norm": 0.9082075953483582, "learning_rate": 2e-05, "loss": 0.6354, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15699, "tokens_per_second_per_gpu": 9899.84, "total_tokens": 1549735912 }, { "epoch": 0.9814953738434609, "grad_norm": 0.9379040002822876, "learning_rate": 2e-05, "loss": 0.6196, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15700, "tokens_per_second_per_gpu": 9850.61, "total_tokens": 1549832619 }, { "epoch": 0.9815578894723681, "grad_norm": 0.8702616691589355, "learning_rate": 2e-05, "loss": 0.639, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15701, "tokens_per_second_per_gpu": 10641.12, "total_tokens": 1549932001 }, { "epoch": 0.9816204051012754, "grad_norm": 0.846514105796814, "learning_rate": 2e-05, "loss": 0.589, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15702, "tokens_per_second_per_gpu": 10828.7, "total_tokens": 1550032777 }, { "epoch": 0.9816829207301826, "grad_norm": 0.9034148454666138, "learning_rate": 2e-05, "loss": 0.5639, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15703, "tokens_per_second_per_gpu": 9874.2, "total_tokens": 1550128020 }, { "epoch": 0.9817454363590897, "grad_norm": 0.8816498517990112, "learning_rate": 2e-05, "loss": 0.6326, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15704, "tokens_per_second_per_gpu": 10528.43, "total_tokens": 1550226311 }, { "epoch": 0.981807951987997, "grad_norm": 0.9204788208007812, "learning_rate": 2e-05, "loss": 0.5893, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15705, "tokens_per_second_per_gpu": 9941.04, "total_tokens": 1550318042 }, { "epoch": 0.9818704676169042, "grad_norm": 0.886669397354126, "learning_rate": 2e-05, "loss": 0.5704, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15706, "tokens_per_second_per_gpu": 9910.37, "total_tokens": 1550412859 }, { "epoch": 0.9819329832458115, "grad_norm": 0.82452791929245, "learning_rate": 2e-05, "loss": 0.5833, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15707, "tokens_per_second_per_gpu": 10437.68, "total_tokens": 1550510743 }, { "epoch": 0.9819954988747187, "grad_norm": 0.8710805177688599, "learning_rate": 2e-05, "loss": 0.6149, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15708, "tokens_per_second_per_gpu": 10529.58, "total_tokens": 1550609562 }, { "epoch": 0.982058014503626, "grad_norm": 0.8439096212387085, "learning_rate": 2e-05, "loss": 0.5617, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15709, "tokens_per_second_per_gpu": 10989.25, "total_tokens": 1550710807 }, { "epoch": 0.9821205301325331, "grad_norm": 0.9056156873703003, "learning_rate": 2e-05, "loss": 0.6231, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15710, "tokens_per_second_per_gpu": 9592.73, "total_tokens": 1550804880 }, { "epoch": 0.9821830457614403, "grad_norm": 0.9638034105300903, "learning_rate": 2e-05, "loss": 0.6557, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15711, "tokens_per_second_per_gpu": 11218.05, "total_tokens": 1550908885 }, { "epoch": 0.9822455613903476, "grad_norm": 0.8984273672103882, "learning_rate": 2e-05, "loss": 0.6035, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15712, "tokens_per_second_per_gpu": 10543.97, "total_tokens": 1551009092 }, { "epoch": 0.9823080770192548, "grad_norm": 0.8584592938423157, "learning_rate": 2e-05, "loss": 0.6082, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15713, "tokens_per_second_per_gpu": 10424.1, "total_tokens": 1551108371 }, { "epoch": 0.9823705926481621, "grad_norm": 0.8917186856269836, "learning_rate": 2e-05, "loss": 0.6144, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15714, "tokens_per_second_per_gpu": 11029.3, "total_tokens": 1551209635 }, { "epoch": 0.9824331082770693, "grad_norm": 0.872397243976593, "learning_rate": 2e-05, "loss": 0.6184, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15715, "tokens_per_second_per_gpu": 11526.21, "total_tokens": 1551313838 }, { "epoch": 0.9824956239059764, "grad_norm": 0.89053875207901, "learning_rate": 2e-05, "loss": 0.6062, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15716, "tokens_per_second_per_gpu": 10667.7, "total_tokens": 1551413591 }, { "epoch": 0.9825581395348837, "grad_norm": 0.8524038791656494, "learning_rate": 2e-05, "loss": 0.6181, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15717, "tokens_per_second_per_gpu": 10477.22, "total_tokens": 1551516043 }, { "epoch": 0.9826206551637909, "grad_norm": 0.8955347537994385, "learning_rate": 2e-05, "loss": 0.6281, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15718, "tokens_per_second_per_gpu": 11241.75, "total_tokens": 1551616869 }, { "epoch": 0.9826831707926982, "grad_norm": 0.8621971011161804, "learning_rate": 2e-05, "loss": 0.5852, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15719, "tokens_per_second_per_gpu": 10376.15, "total_tokens": 1551715107 }, { "epoch": 0.9827456864216054, "grad_norm": 0.9143784642219543, "learning_rate": 2e-05, "loss": 0.5997, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15720, "tokens_per_second_per_gpu": 9804.78, "total_tokens": 1551806765 }, { "epoch": 0.9828082020505127, "grad_norm": 0.8833112120628357, "learning_rate": 2e-05, "loss": 0.6031, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15721, "tokens_per_second_per_gpu": 14382.72, "total_tokens": 1551908579 }, { "epoch": 0.9828707176794198, "grad_norm": 0.9098621010780334, "learning_rate": 2e-05, "loss": 0.606, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15722, "tokens_per_second_per_gpu": 10422.82, "total_tokens": 1552009702 }, { "epoch": 0.982933233308327, "grad_norm": 0.894778847694397, "learning_rate": 2e-05, "loss": 0.6169, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15723, "tokens_per_second_per_gpu": 11309.05, "total_tokens": 1552112346 }, { "epoch": 0.9829957489372343, "grad_norm": 0.902157723903656, "learning_rate": 2e-05, "loss": 0.5958, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15724, "tokens_per_second_per_gpu": 10605.34, "total_tokens": 1552211184 }, { "epoch": 0.9830582645661415, "grad_norm": 0.8738215565681458, "learning_rate": 2e-05, "loss": 0.6514, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15725, "tokens_per_second_per_gpu": 11434.86, "total_tokens": 1552318789 }, { "epoch": 0.9831207801950488, "grad_norm": 0.8946260213851929, "learning_rate": 2e-05, "loss": 0.5727, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15726, "tokens_per_second_per_gpu": 10480.21, "total_tokens": 1552417074 }, { "epoch": 0.983183295823956, "grad_norm": 0.9640561938285828, "learning_rate": 2e-05, "loss": 0.6344, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15727, "tokens_per_second_per_gpu": 10837.09, "total_tokens": 1552516829 }, { "epoch": 0.9832458114528632, "grad_norm": 0.8791601657867432, "learning_rate": 2e-05, "loss": 0.6057, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15728, "tokens_per_second_per_gpu": 10125.62, "total_tokens": 1552615760 }, { "epoch": 0.9833083270817704, "grad_norm": 0.8604347109794617, "learning_rate": 2e-05, "loss": 0.6129, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15729, "tokens_per_second_per_gpu": 10959.8, "total_tokens": 1552720828 }, { "epoch": 0.9833708427106777, "grad_norm": 0.8682284355163574, "learning_rate": 2e-05, "loss": 0.5508, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15730, "tokens_per_second_per_gpu": 9930.66, "total_tokens": 1552820544 }, { "epoch": 0.9834333583395849, "grad_norm": 0.8792428374290466, "learning_rate": 2e-05, "loss": 0.5684, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15731, "tokens_per_second_per_gpu": 10169.77, "total_tokens": 1552918421 }, { "epoch": 0.9834958739684921, "grad_norm": 0.9059048891067505, "learning_rate": 2e-05, "loss": 0.6008, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15732, "tokens_per_second_per_gpu": 9918.06, "total_tokens": 1553014161 }, { "epoch": 0.9835583895973994, "grad_norm": 0.8852452635765076, "learning_rate": 2e-05, "loss": 0.6637, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15733, "tokens_per_second_per_gpu": 11315.95, "total_tokens": 1553117164 }, { "epoch": 0.9836209052263066, "grad_norm": 0.8955448269844055, "learning_rate": 2e-05, "loss": 0.598, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15734, "tokens_per_second_per_gpu": 10466.17, "total_tokens": 1553214005 }, { "epoch": 0.9836834208552138, "grad_norm": 0.8990578055381775, "learning_rate": 2e-05, "loss": 0.6274, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15735, "tokens_per_second_per_gpu": 9921.2, "total_tokens": 1553311556 }, { "epoch": 0.983745936484121, "grad_norm": 0.9035133719444275, "learning_rate": 2e-05, "loss": 0.6007, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15736, "tokens_per_second_per_gpu": 9653.16, "total_tokens": 1553408268 }, { "epoch": 0.9838084521130283, "grad_norm": 0.8839661478996277, "learning_rate": 2e-05, "loss": 0.5826, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15737, "tokens_per_second_per_gpu": 10476.32, "total_tokens": 1553505406 }, { "epoch": 0.9838709677419355, "grad_norm": 0.8779062032699585, "learning_rate": 2e-05, "loss": 0.5991, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15738, "tokens_per_second_per_gpu": 10649.51, "total_tokens": 1553603864 }, { "epoch": 0.9839334833708427, "grad_norm": 0.8756689429283142, "learning_rate": 2e-05, "loss": 0.615, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15739, "tokens_per_second_per_gpu": 10693.0, "total_tokens": 1553701138 }, { "epoch": 0.98399599899975, "grad_norm": 0.8624169826507568, "learning_rate": 2e-05, "loss": 0.6035, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15740, "tokens_per_second_per_gpu": 11252.68, "total_tokens": 1553802274 }, { "epoch": 0.9840585146286571, "grad_norm": 0.8936449885368347, "learning_rate": 2e-05, "loss": 0.5785, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15741, "tokens_per_second_per_gpu": 10891.98, "total_tokens": 1553899009 }, { "epoch": 0.9841210302575644, "grad_norm": 0.8812035322189331, "learning_rate": 2e-05, "loss": 0.6418, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15742, "tokens_per_second_per_gpu": 10852.74, "total_tokens": 1553998325 }, { "epoch": 0.9841835458864716, "grad_norm": 0.8782984614372253, "learning_rate": 2e-05, "loss": 0.5816, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15743, "tokens_per_second_per_gpu": 10260.69, "total_tokens": 1554095939 }, { "epoch": 0.9842460615153789, "grad_norm": 0.8842014074325562, "learning_rate": 2e-05, "loss": 0.5995, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15744, "tokens_per_second_per_gpu": 9932.23, "total_tokens": 1554193336 }, { "epoch": 0.9843085771442861, "grad_norm": 0.8691210746765137, "learning_rate": 2e-05, "loss": 0.5907, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15745, "tokens_per_second_per_gpu": 10629.92, "total_tokens": 1554293037 }, { "epoch": 0.9843710927731933, "grad_norm": 0.8721778392791748, "learning_rate": 2e-05, "loss": 0.6068, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15746, "tokens_per_second_per_gpu": 10551.49, "total_tokens": 1554396528 }, { "epoch": 0.9844336084021005, "grad_norm": 0.8832220435142517, "learning_rate": 2e-05, "loss": 0.6082, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15747, "tokens_per_second_per_gpu": 11159.77, "total_tokens": 1554499579 }, { "epoch": 0.9844961240310077, "grad_norm": 0.8803620338439941, "learning_rate": 2e-05, "loss": 0.5828, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15748, "tokens_per_second_per_gpu": 11252.95, "total_tokens": 1554601522 }, { "epoch": 0.984558639659915, "grad_norm": 0.8720315098762512, "learning_rate": 2e-05, "loss": 0.5904, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15749, "tokens_per_second_per_gpu": 10476.64, "total_tokens": 1554704821 }, { "epoch": 0.9846211552888222, "grad_norm": 0.8920251727104187, "learning_rate": 2e-05, "loss": 0.5992, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15750, "tokens_per_second_per_gpu": 10382.26, "total_tokens": 1554804346 }, { "epoch": 0.9846836709177295, "grad_norm": 0.9023668766021729, "learning_rate": 2e-05, "loss": 0.6025, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15751, "tokens_per_second_per_gpu": 10350.0, "total_tokens": 1554899260 }, { "epoch": 0.9847461865466367, "grad_norm": 0.9039082527160645, "learning_rate": 2e-05, "loss": 0.6471, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15752, "tokens_per_second_per_gpu": 10629.28, "total_tokens": 1554996225 }, { "epoch": 0.9848087021755438, "grad_norm": 0.9333465099334717, "learning_rate": 2e-05, "loss": 0.615, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15753, "tokens_per_second_per_gpu": 10160.53, "total_tokens": 1555090387 }, { "epoch": 0.9848712178044511, "grad_norm": 0.8653504252433777, "learning_rate": 2e-05, "loss": 0.5957, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15754, "tokens_per_second_per_gpu": 10605.81, "total_tokens": 1555192262 }, { "epoch": 0.9849337334333583, "grad_norm": 0.8847772479057312, "learning_rate": 2e-05, "loss": 0.5741, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15755, "tokens_per_second_per_gpu": 9196.12, "total_tokens": 1555287577 }, { "epoch": 0.9849962490622656, "grad_norm": 0.8833471536636353, "learning_rate": 2e-05, "loss": 0.5977, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15756, "tokens_per_second_per_gpu": 10157.21, "total_tokens": 1555388026 }, { "epoch": 0.9850587646911728, "grad_norm": 0.8624196648597717, "learning_rate": 2e-05, "loss": 0.6068, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15757, "tokens_per_second_per_gpu": 10312.06, "total_tokens": 1555489400 }, { "epoch": 0.9851212803200801, "grad_norm": 0.9068976640701294, "learning_rate": 2e-05, "loss": 0.6119, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15758, "tokens_per_second_per_gpu": 11180.67, "total_tokens": 1555593124 }, { "epoch": 0.9851837959489872, "grad_norm": 0.8537363409996033, "learning_rate": 2e-05, "loss": 0.6084, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15759, "tokens_per_second_per_gpu": 10943.82, "total_tokens": 1555694216 }, { "epoch": 0.9852463115778944, "grad_norm": 0.9052940607070923, "learning_rate": 2e-05, "loss": 0.5946, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15760, "tokens_per_second_per_gpu": 10171.04, "total_tokens": 1555790553 }, { "epoch": 0.9853088272068017, "grad_norm": 0.9278081655502319, "learning_rate": 2e-05, "loss": 0.597, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15761, "tokens_per_second_per_gpu": 10895.85, "total_tokens": 1555891639 }, { "epoch": 0.9853713428357089, "grad_norm": 0.8791849613189697, "learning_rate": 2e-05, "loss": 0.6463, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15762, "tokens_per_second_per_gpu": 10769.15, "total_tokens": 1555993539 }, { "epoch": 0.9854338584646162, "grad_norm": 0.9067001938819885, "learning_rate": 2e-05, "loss": 0.6092, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15763, "tokens_per_second_per_gpu": 10637.19, "total_tokens": 1556096373 }, { "epoch": 0.9854963740935234, "grad_norm": 0.8435015678405762, "learning_rate": 2e-05, "loss": 0.5932, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15764, "tokens_per_second_per_gpu": 11065.39, "total_tokens": 1556197443 }, { "epoch": 0.9855588897224306, "grad_norm": 0.8739741444587708, "learning_rate": 2e-05, "loss": 0.6002, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15765, "tokens_per_second_per_gpu": 11055.29, "total_tokens": 1556298370 }, { "epoch": 0.9856214053513378, "grad_norm": 0.899983286857605, "learning_rate": 2e-05, "loss": 0.5995, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15766, "tokens_per_second_per_gpu": 10099.54, "total_tokens": 1556396778 }, { "epoch": 0.985683920980245, "grad_norm": 0.866172730922699, "learning_rate": 2e-05, "loss": 0.5712, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15767, "tokens_per_second_per_gpu": 10793.87, "total_tokens": 1556494499 }, { "epoch": 0.9857464366091523, "grad_norm": 0.8624178171157837, "learning_rate": 2e-05, "loss": 0.5733, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15768, "tokens_per_second_per_gpu": 9813.29, "total_tokens": 1556588977 }, { "epoch": 0.9858089522380595, "grad_norm": 0.8960638046264648, "learning_rate": 2e-05, "loss": 0.6032, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15769, "tokens_per_second_per_gpu": 10755.8, "total_tokens": 1556685792 }, { "epoch": 0.9858714678669668, "grad_norm": 0.9296777844429016, "learning_rate": 2e-05, "loss": 0.6107, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15770, "tokens_per_second_per_gpu": 10365.73, "total_tokens": 1556783318 }, { "epoch": 0.985933983495874, "grad_norm": 0.8875413537025452, "learning_rate": 2e-05, "loss": 0.5993, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15771, "tokens_per_second_per_gpu": 10195.6, "total_tokens": 1556878939 }, { "epoch": 0.9859964991247812, "grad_norm": 0.86409592628479, "learning_rate": 2e-05, "loss": 0.6384, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15772, "tokens_per_second_per_gpu": 11085.98, "total_tokens": 1556982873 }, { "epoch": 0.9860590147536884, "grad_norm": 0.9047265648841858, "learning_rate": 2e-05, "loss": 0.602, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15773, "tokens_per_second_per_gpu": 10010.34, "total_tokens": 1557080903 }, { "epoch": 0.9861215303825956, "grad_norm": 0.9190396070480347, "learning_rate": 2e-05, "loss": 0.6054, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15774, "tokens_per_second_per_gpu": 10304.41, "total_tokens": 1557177211 }, { "epoch": 0.9861840460115029, "grad_norm": 0.8966732621192932, "learning_rate": 2e-05, "loss": 0.5846, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15775, "tokens_per_second_per_gpu": 10635.44, "total_tokens": 1557276972 }, { "epoch": 0.9862465616404101, "grad_norm": 0.913949728012085, "learning_rate": 2e-05, "loss": 0.6478, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15776, "tokens_per_second_per_gpu": 10140.17, "total_tokens": 1557378482 }, { "epoch": 0.9863090772693174, "grad_norm": 0.8874619007110596, "learning_rate": 2e-05, "loss": 0.586, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15777, "tokens_per_second_per_gpu": 10437.57, "total_tokens": 1557479732 }, { "epoch": 0.9863715928982245, "grad_norm": 0.8766249418258667, "learning_rate": 2e-05, "loss": 0.5809, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15778, "tokens_per_second_per_gpu": 10864.0, "total_tokens": 1557581780 }, { "epoch": 0.9864341085271318, "grad_norm": 0.9463603496551514, "learning_rate": 2e-05, "loss": 0.6018, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15779, "tokens_per_second_per_gpu": 10279.31, "total_tokens": 1557677451 }, { "epoch": 0.986496624156039, "grad_norm": 0.8825604319572449, "learning_rate": 2e-05, "loss": 0.6533, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15780, "tokens_per_second_per_gpu": 11304.89, "total_tokens": 1557781765 }, { "epoch": 0.9865591397849462, "grad_norm": 0.8551387786865234, "learning_rate": 2e-05, "loss": 0.5705, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15781, "tokens_per_second_per_gpu": 11299.03, "total_tokens": 1557882686 }, { "epoch": 0.9866216554138535, "grad_norm": 0.8817214965820312, "learning_rate": 2e-05, "loss": 0.5998, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15782, "tokens_per_second_per_gpu": 10756.97, "total_tokens": 1557980330 }, { "epoch": 0.9866841710427607, "grad_norm": 0.89701908826828, "learning_rate": 2e-05, "loss": 0.5674, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15783, "tokens_per_second_per_gpu": 9849.37, "total_tokens": 1558075348 }, { "epoch": 0.9867466866716679, "grad_norm": 0.9009668231010437, "learning_rate": 2e-05, "loss": 0.5888, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15784, "tokens_per_second_per_gpu": 10287.36, "total_tokens": 1558174021 }, { "epoch": 0.9868092023005751, "grad_norm": 0.8409775495529175, "learning_rate": 2e-05, "loss": 0.5971, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15785, "tokens_per_second_per_gpu": 10507.9, "total_tokens": 1558273223 }, { "epoch": 0.9868717179294824, "grad_norm": 0.8550457954406738, "learning_rate": 2e-05, "loss": 0.5946, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15786, "tokens_per_second_per_gpu": 10539.06, "total_tokens": 1558371999 }, { "epoch": 0.9869342335583896, "grad_norm": 0.9164126515388489, "learning_rate": 2e-05, "loss": 0.6409, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15787, "tokens_per_second_per_gpu": 10634.53, "total_tokens": 1558473771 }, { "epoch": 0.9869967491872969, "grad_norm": 0.9004088044166565, "learning_rate": 2e-05, "loss": 0.6015, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15788, "tokens_per_second_per_gpu": 10720.62, "total_tokens": 1558572621 }, { "epoch": 0.9870592648162041, "grad_norm": 0.8746528029441833, "learning_rate": 2e-05, "loss": 0.6262, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15789, "tokens_per_second_per_gpu": 11013.16, "total_tokens": 1558673711 }, { "epoch": 0.9871217804451112, "grad_norm": 0.8601753115653992, "learning_rate": 2e-05, "loss": 0.5971, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15790, "tokens_per_second_per_gpu": 11014.48, "total_tokens": 1558774809 }, { "epoch": 0.9871842960740185, "grad_norm": 0.9442363381385803, "learning_rate": 2e-05, "loss": 0.62, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15791, "tokens_per_second_per_gpu": 11111.09, "total_tokens": 1558874867 }, { "epoch": 0.9872468117029257, "grad_norm": 0.8847482204437256, "learning_rate": 2e-05, "loss": 0.6039, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15792, "tokens_per_second_per_gpu": 10135.47, "total_tokens": 1558971538 }, { "epoch": 0.987309327331833, "grad_norm": 0.8940105438232422, "learning_rate": 2e-05, "loss": 0.6048, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15793, "tokens_per_second_per_gpu": 9643.95, "total_tokens": 1559065965 }, { "epoch": 0.9873718429607402, "grad_norm": 0.9245887994766235, "learning_rate": 2e-05, "loss": 0.6322, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15794, "tokens_per_second_per_gpu": 9911.03, "total_tokens": 1559162471 }, { "epoch": 0.9874343585896475, "grad_norm": 0.8609752058982849, "learning_rate": 2e-05, "loss": 0.6, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15795, "tokens_per_second_per_gpu": 10660.16, "total_tokens": 1559264073 }, { "epoch": 0.9874968742185546, "grad_norm": 0.8716709017753601, "learning_rate": 2e-05, "loss": 0.6217, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15796, "tokens_per_second_per_gpu": 10549.73, "total_tokens": 1559367086 }, { "epoch": 0.9875593898474618, "grad_norm": 0.9119961261749268, "learning_rate": 2e-05, "loss": 0.6475, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15797, "tokens_per_second_per_gpu": 10711.88, "total_tokens": 1559467169 }, { "epoch": 0.9876219054763691, "grad_norm": 0.9477757215499878, "learning_rate": 2e-05, "loss": 0.6083, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15798, "tokens_per_second_per_gpu": 10408.77, "total_tokens": 1559563684 }, { "epoch": 0.9876844211052763, "grad_norm": 0.8982642889022827, "learning_rate": 2e-05, "loss": 0.5957, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15799, "tokens_per_second_per_gpu": 10833.34, "total_tokens": 1559658350 }, { "epoch": 0.9877469367341836, "grad_norm": 0.8976955413818359, "learning_rate": 2e-05, "loss": 0.6248, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15800, "tokens_per_second_per_gpu": 10727.01, "total_tokens": 1559758762 }, { "epoch": 0.9878094523630908, "grad_norm": 0.8857718110084534, "learning_rate": 2e-05, "loss": 0.6101, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15801, "tokens_per_second_per_gpu": 10471.08, "total_tokens": 1559857930 }, { "epoch": 0.987871967991998, "grad_norm": 0.8764079213142395, "learning_rate": 2e-05, "loss": 0.6323, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15802, "tokens_per_second_per_gpu": 10560.54, "total_tokens": 1559958281 }, { "epoch": 0.9879344836209052, "grad_norm": 1.0709114074707031, "learning_rate": 2e-05, "loss": 0.6199, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15803, "tokens_per_second_per_gpu": 10547.21, "total_tokens": 1560058915 }, { "epoch": 0.9879969992498124, "grad_norm": 0.8965790271759033, "learning_rate": 2e-05, "loss": 0.6029, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15804, "tokens_per_second_per_gpu": 10081.99, "total_tokens": 1560156986 }, { "epoch": 0.9880595148787197, "grad_norm": 0.8889206051826477, "learning_rate": 2e-05, "loss": 0.6134, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15805, "tokens_per_second_per_gpu": 10252.05, "total_tokens": 1560254656 }, { "epoch": 0.9881220305076269, "grad_norm": 0.8948696851730347, "learning_rate": 2e-05, "loss": 0.6495, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15806, "tokens_per_second_per_gpu": 10615.86, "total_tokens": 1560354985 }, { "epoch": 0.9881845461365342, "grad_norm": 0.840465247631073, "learning_rate": 2e-05, "loss": 0.59, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15807, "tokens_per_second_per_gpu": 11090.54, "total_tokens": 1560454333 }, { "epoch": 0.9882470617654414, "grad_norm": 0.8972886204719543, "learning_rate": 2e-05, "loss": 0.6227, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15808, "tokens_per_second_per_gpu": 9955.63, "total_tokens": 1560554991 }, { "epoch": 0.9883095773943485, "grad_norm": 0.9341471791267395, "learning_rate": 2e-05, "loss": 0.6166, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15809, "tokens_per_second_per_gpu": 10657.34, "total_tokens": 1560653167 }, { "epoch": 0.9883720930232558, "grad_norm": 0.9237269759178162, "learning_rate": 2e-05, "loss": 0.5757, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15810, "tokens_per_second_per_gpu": 10965.17, "total_tokens": 1560754232 }, { "epoch": 0.988434608652163, "grad_norm": 0.8975370526313782, "learning_rate": 2e-05, "loss": 0.6834, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15811, "tokens_per_second_per_gpu": 10876.14, "total_tokens": 1560855409 }, { "epoch": 0.9884971242810703, "grad_norm": 1.0147837400436401, "learning_rate": 2e-05, "loss": 0.6025, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15812, "tokens_per_second_per_gpu": 11011.8, "total_tokens": 1560954222 }, { "epoch": 0.9885596399099775, "grad_norm": 0.9029684066772461, "learning_rate": 2e-05, "loss": 0.6043, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15813, "tokens_per_second_per_gpu": 11331.54, "total_tokens": 1561056579 }, { "epoch": 0.9886221555388848, "grad_norm": 0.9042369723320007, "learning_rate": 2e-05, "loss": 0.5876, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15814, "tokens_per_second_per_gpu": 10356.7, "total_tokens": 1561149675 }, { "epoch": 0.9886846711677919, "grad_norm": 0.9277940392494202, "learning_rate": 2e-05, "loss": 0.6118, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15815, "tokens_per_second_per_gpu": 10920.51, "total_tokens": 1561248149 }, { "epoch": 0.9887471867966992, "grad_norm": 0.8822433948516846, "learning_rate": 2e-05, "loss": 0.5957, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15816, "tokens_per_second_per_gpu": 10426.93, "total_tokens": 1561346121 }, { "epoch": 0.9888097024256064, "grad_norm": 0.9052619934082031, "learning_rate": 2e-05, "loss": 0.5892, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15817, "tokens_per_second_per_gpu": 9483.66, "total_tokens": 1561440575 }, { "epoch": 0.9888722180545136, "grad_norm": 0.924513041973114, "learning_rate": 2e-05, "loss": 0.6163, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15818, "tokens_per_second_per_gpu": 9836.28, "total_tokens": 1561536596 }, { "epoch": 0.9889347336834209, "grad_norm": 0.9292891025543213, "learning_rate": 2e-05, "loss": 0.6157, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15819, "tokens_per_second_per_gpu": 10494.45, "total_tokens": 1561634225 }, { "epoch": 0.9889972493123281, "grad_norm": 0.9233109951019287, "learning_rate": 2e-05, "loss": 0.6033, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15820, "tokens_per_second_per_gpu": 9944.69, "total_tokens": 1561730252 }, { "epoch": 0.9890597649412353, "grad_norm": 0.9116966724395752, "learning_rate": 2e-05, "loss": 0.576, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15821, "tokens_per_second_per_gpu": 10466.61, "total_tokens": 1561827118 }, { "epoch": 0.9891222805701425, "grad_norm": 0.8881847858428955, "learning_rate": 2e-05, "loss": 0.58, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15822, "tokens_per_second_per_gpu": 10239.35, "total_tokens": 1561926642 }, { "epoch": 0.9891847961990498, "grad_norm": 0.9517824649810791, "learning_rate": 2e-05, "loss": 0.5965, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15823, "tokens_per_second_per_gpu": 10280.09, "total_tokens": 1562019906 }, { "epoch": 0.989247311827957, "grad_norm": 0.9306607842445374, "learning_rate": 2e-05, "loss": 0.5962, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15824, "tokens_per_second_per_gpu": 11007.14, "total_tokens": 1562116774 }, { "epoch": 0.9893098274568642, "grad_norm": 0.8731775879859924, "learning_rate": 2e-05, "loss": 0.5925, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15825, "tokens_per_second_per_gpu": 11357.1, "total_tokens": 1562215913 }, { "epoch": 0.9893723430857715, "grad_norm": 0.9046570658683777, "learning_rate": 2e-05, "loss": 0.5753, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15826, "tokens_per_second_per_gpu": 10312.82, "total_tokens": 1562312294 }, { "epoch": 0.9894348587146786, "grad_norm": 0.9336380958557129, "learning_rate": 2e-05, "loss": 0.5781, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15827, "tokens_per_second_per_gpu": 10384.22, "total_tokens": 1562406373 }, { "epoch": 0.9894973743435859, "grad_norm": 0.8557206392288208, "learning_rate": 2e-05, "loss": 0.5984, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15828, "tokens_per_second_per_gpu": 10249.64, "total_tokens": 1562505180 }, { "epoch": 0.9895598899724931, "grad_norm": 0.8804304599761963, "learning_rate": 2e-05, "loss": 0.609, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15829, "tokens_per_second_per_gpu": 10123.82, "total_tokens": 1562603487 }, { "epoch": 0.9896224056014004, "grad_norm": 0.8829546570777893, "learning_rate": 2e-05, "loss": 0.6268, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15830, "tokens_per_second_per_gpu": 11057.48, "total_tokens": 1562707388 }, { "epoch": 0.9896849212303076, "grad_norm": 0.9170042276382446, "learning_rate": 2e-05, "loss": 0.5826, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15831, "tokens_per_second_per_gpu": 10998.2, "total_tokens": 1562801966 }, { "epoch": 0.9897474368592148, "grad_norm": 0.912675142288208, "learning_rate": 2e-05, "loss": 0.594, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15832, "tokens_per_second_per_gpu": 10573.83, "total_tokens": 1562901353 }, { "epoch": 0.989809952488122, "grad_norm": 0.9175923466682434, "learning_rate": 2e-05, "loss": 0.6023, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15833, "tokens_per_second_per_gpu": 10448.86, "total_tokens": 1563001654 }, { "epoch": 0.9898724681170292, "grad_norm": 0.8901062607765198, "learning_rate": 2e-05, "loss": 0.6099, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15834, "tokens_per_second_per_gpu": 10675.25, "total_tokens": 1563104593 }, { "epoch": 0.9899349837459365, "grad_norm": 0.9162322282791138, "learning_rate": 2e-05, "loss": 0.567, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15835, "tokens_per_second_per_gpu": 9900.93, "total_tokens": 1563197612 }, { "epoch": 0.9899974993748437, "grad_norm": 0.85677170753479, "learning_rate": 2e-05, "loss": 0.6044, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15836, "tokens_per_second_per_gpu": 10619.97, "total_tokens": 1563298311 }, { "epoch": 0.990060015003751, "grad_norm": 0.880370020866394, "learning_rate": 2e-05, "loss": 0.6052, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15837, "tokens_per_second_per_gpu": 11216.59, "total_tokens": 1563402408 }, { "epoch": 0.9901225306326582, "grad_norm": 0.9254308938980103, "learning_rate": 2e-05, "loss": 0.6243, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15838, "tokens_per_second_per_gpu": 9472.58, "total_tokens": 1563497858 }, { "epoch": 0.9901850462615653, "grad_norm": 0.9541401267051697, "learning_rate": 2e-05, "loss": 0.5807, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15839, "tokens_per_second_per_gpu": 9843.93, "total_tokens": 1563592237 }, { "epoch": 0.9902475618904726, "grad_norm": 0.9471407532691956, "learning_rate": 2e-05, "loss": 0.6137, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15840, "tokens_per_second_per_gpu": 9858.15, "total_tokens": 1563686838 }, { "epoch": 0.9903100775193798, "grad_norm": 0.9169267416000366, "learning_rate": 2e-05, "loss": 0.5813, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15841, "tokens_per_second_per_gpu": 9972.79, "total_tokens": 1563781423 }, { "epoch": 0.9903725931482871, "grad_norm": 0.9193705916404724, "learning_rate": 2e-05, "loss": 0.6215, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15842, "tokens_per_second_per_gpu": 10166.59, "total_tokens": 1563879656 }, { "epoch": 0.9904351087771943, "grad_norm": 0.8746011257171631, "learning_rate": 2e-05, "loss": 0.5978, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15843, "tokens_per_second_per_gpu": 10457.32, "total_tokens": 1563978604 }, { "epoch": 0.9904976244061016, "grad_norm": 0.8994379043579102, "learning_rate": 2e-05, "loss": 0.6324, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15844, "tokens_per_second_per_gpu": 10562.8, "total_tokens": 1564076591 }, { "epoch": 0.9905601400350088, "grad_norm": 0.9233165383338928, "learning_rate": 2e-05, "loss": 0.5892, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15845, "tokens_per_second_per_gpu": 9372.56, "total_tokens": 1564173730 }, { "epoch": 0.9906226556639159, "grad_norm": 0.9316259026527405, "learning_rate": 2e-05, "loss": 0.6382, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15846, "tokens_per_second_per_gpu": 10735.13, "total_tokens": 1564270508 }, { "epoch": 0.9906851712928232, "grad_norm": 0.9007936120033264, "learning_rate": 2e-05, "loss": 0.5982, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15847, "tokens_per_second_per_gpu": 11126.39, "total_tokens": 1564371551 }, { "epoch": 0.9907476869217304, "grad_norm": 0.8900974988937378, "learning_rate": 2e-05, "loss": 0.5555, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15848, "tokens_per_second_per_gpu": 10191.83, "total_tokens": 1564468407 }, { "epoch": 0.9908102025506377, "grad_norm": 0.8840906620025635, "learning_rate": 2e-05, "loss": 0.6298, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15849, "tokens_per_second_per_gpu": 10904.06, "total_tokens": 1564567780 }, { "epoch": 0.9908727181795449, "grad_norm": 0.8994986414909363, "learning_rate": 2e-05, "loss": 0.5715, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15850, "tokens_per_second_per_gpu": 10746.07, "total_tokens": 1564664641 }, { "epoch": 0.9909352338084522, "grad_norm": 0.8585104942321777, "learning_rate": 2e-05, "loss": 0.5832, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15851, "tokens_per_second_per_gpu": 10894.75, "total_tokens": 1564764153 }, { "epoch": 0.9909977494373593, "grad_norm": 0.8913384675979614, "learning_rate": 2e-05, "loss": 0.5802, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15852, "tokens_per_second_per_gpu": 9725.53, "total_tokens": 1564858248 }, { "epoch": 0.9910602650662665, "grad_norm": 0.9128180146217346, "learning_rate": 2e-05, "loss": 0.6424, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15853, "tokens_per_second_per_gpu": 11816.8, "total_tokens": 1564958815 }, { "epoch": 0.9911227806951738, "grad_norm": 0.8848622441291809, "learning_rate": 2e-05, "loss": 0.6109, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15854, "tokens_per_second_per_gpu": 11373.8, "total_tokens": 1565062531 }, { "epoch": 0.991185296324081, "grad_norm": 0.8745920658111572, "learning_rate": 2e-05, "loss": 0.5966, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15855, "tokens_per_second_per_gpu": 10420.61, "total_tokens": 1565162871 }, { "epoch": 0.9912478119529883, "grad_norm": 0.9080686569213867, "learning_rate": 2e-05, "loss": 0.6327, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15856, "tokens_per_second_per_gpu": 10032.55, "total_tokens": 1565261615 }, { "epoch": 0.9913103275818955, "grad_norm": 0.873296320438385, "learning_rate": 2e-05, "loss": 0.6036, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15857, "tokens_per_second_per_gpu": 11318.57, "total_tokens": 1565362127 }, { "epoch": 0.9913728432108027, "grad_norm": 0.8975057005882263, "learning_rate": 2e-05, "loss": 0.6096, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15858, "tokens_per_second_per_gpu": 11129.79, "total_tokens": 1565460774 }, { "epoch": 0.9914353588397099, "grad_norm": 0.8987200856208801, "learning_rate": 2e-05, "loss": 0.5438, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15859, "tokens_per_second_per_gpu": 9663.46, "total_tokens": 1565551748 }, { "epoch": 0.9914978744686171, "grad_norm": 0.9407920837402344, "learning_rate": 2e-05, "loss": 0.6689, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15860, "tokens_per_second_per_gpu": 10573.02, "total_tokens": 1565650983 }, { "epoch": 0.9915603900975244, "grad_norm": 0.9319972395896912, "learning_rate": 2e-05, "loss": 0.5844, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15861, "tokens_per_second_per_gpu": 9590.68, "total_tokens": 1565745285 }, { "epoch": 0.9916229057264316, "grad_norm": 0.9298956990242004, "learning_rate": 2e-05, "loss": 0.6286, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15862, "tokens_per_second_per_gpu": 9823.37, "total_tokens": 1565842112 }, { "epoch": 0.9916854213553389, "grad_norm": 0.9238201379776001, "learning_rate": 2e-05, "loss": 0.6096, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15863, "tokens_per_second_per_gpu": 10049.23, "total_tokens": 1565937966 }, { "epoch": 0.991747936984246, "grad_norm": 0.8926572799682617, "learning_rate": 2e-05, "loss": 0.6122, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15864, "tokens_per_second_per_gpu": 11235.69, "total_tokens": 1566038069 }, { "epoch": 0.9918104526131533, "grad_norm": 0.8789674639701843, "learning_rate": 2e-05, "loss": 0.6008, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15865, "tokens_per_second_per_gpu": 10933.83, "total_tokens": 1566139589 }, { "epoch": 0.9918729682420605, "grad_norm": 0.8975740671157837, "learning_rate": 2e-05, "loss": 0.5921, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15866, "tokens_per_second_per_gpu": 10566.42, "total_tokens": 1566237615 }, { "epoch": 0.9919354838709677, "grad_norm": 0.8905720114707947, "learning_rate": 2e-05, "loss": 0.5851, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15867, "tokens_per_second_per_gpu": 9829.87, "total_tokens": 1566333074 }, { "epoch": 0.991997999499875, "grad_norm": 0.9493891000747681, "learning_rate": 2e-05, "loss": 0.608, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15868, "tokens_per_second_per_gpu": 10583.98, "total_tokens": 1566430201 }, { "epoch": 0.9920605151287822, "grad_norm": 0.88432377576828, "learning_rate": 2e-05, "loss": 0.6034, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15869, "tokens_per_second_per_gpu": 10681.44, "total_tokens": 1566529028 }, { "epoch": 0.9921230307576894, "grad_norm": 0.8754802346229553, "learning_rate": 2e-05, "loss": 0.5844, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15870, "tokens_per_second_per_gpu": 11038.58, "total_tokens": 1566628400 }, { "epoch": 0.9921855463865966, "grad_norm": 0.9070252776145935, "learning_rate": 2e-05, "loss": 0.6125, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15871, "tokens_per_second_per_gpu": 11085.79, "total_tokens": 1566726369 }, { "epoch": 0.9922480620155039, "grad_norm": 0.8696582913398743, "learning_rate": 2e-05, "loss": 0.5743, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15872, "tokens_per_second_per_gpu": 11359.59, "total_tokens": 1566829251 }, { "epoch": 0.9923105776444111, "grad_norm": 0.8727153539657593, "learning_rate": 2e-05, "loss": 0.5806, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15873, "tokens_per_second_per_gpu": 10800.08, "total_tokens": 1566927490 }, { "epoch": 0.9923730932733184, "grad_norm": 0.9065549373626709, "learning_rate": 2e-05, "loss": 0.6086, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15874, "tokens_per_second_per_gpu": 11303.41, "total_tokens": 1567026967 }, { "epoch": 0.9924356089022256, "grad_norm": 0.9044862389564514, "learning_rate": 2e-05, "loss": 0.5894, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15875, "tokens_per_second_per_gpu": 10855.67, "total_tokens": 1567123408 }, { "epoch": 0.9924981245311327, "grad_norm": 0.8900455832481384, "learning_rate": 2e-05, "loss": 0.6274, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15876, "tokens_per_second_per_gpu": 10788.5, "total_tokens": 1567224813 }, { "epoch": 0.99256064016004, "grad_norm": 0.8689277172088623, "learning_rate": 2e-05, "loss": 0.5734, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15877, "tokens_per_second_per_gpu": 10540.32, "total_tokens": 1567324450 }, { "epoch": 0.9926231557889472, "grad_norm": 0.9200575351715088, "learning_rate": 2e-05, "loss": 0.6199, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15878, "tokens_per_second_per_gpu": 10440.75, "total_tokens": 1567421351 }, { "epoch": 0.9926856714178545, "grad_norm": 0.9206771850585938, "learning_rate": 2e-05, "loss": 0.627, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15879, "tokens_per_second_per_gpu": 10568.42, "total_tokens": 1567518823 }, { "epoch": 0.9927481870467617, "grad_norm": 0.9374052882194519, "learning_rate": 2e-05, "loss": 0.5878, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15880, "tokens_per_second_per_gpu": 10621.93, "total_tokens": 1567616176 }, { "epoch": 0.992810702675669, "grad_norm": 0.8779627084732056, "learning_rate": 2e-05, "loss": 0.5934, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15881, "tokens_per_second_per_gpu": 10723.89, "total_tokens": 1567715868 }, { "epoch": 0.9928732183045762, "grad_norm": 0.8978123068809509, "learning_rate": 2e-05, "loss": 0.6027, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15882, "tokens_per_second_per_gpu": 11099.67, "total_tokens": 1567814583 }, { "epoch": 0.9929357339334833, "grad_norm": 0.9040428400039673, "learning_rate": 2e-05, "loss": 0.6579, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15883, "tokens_per_second_per_gpu": 10766.69, "total_tokens": 1567913525 }, { "epoch": 0.9929982495623906, "grad_norm": 0.876150369644165, "learning_rate": 2e-05, "loss": 0.5943, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15884, "tokens_per_second_per_gpu": 10941.12, "total_tokens": 1568013614 }, { "epoch": 0.9930607651912978, "grad_norm": 0.9329072833061218, "learning_rate": 2e-05, "loss": 0.6186, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15885, "tokens_per_second_per_gpu": 9937.99, "total_tokens": 1568110624 }, { "epoch": 0.9931232808202051, "grad_norm": 0.8827958703041077, "learning_rate": 2e-05, "loss": 0.6113, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15886, "tokens_per_second_per_gpu": 10549.23, "total_tokens": 1568208445 }, { "epoch": 0.9931857964491123, "grad_norm": 0.9132986068725586, "learning_rate": 2e-05, "loss": 0.6547, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15887, "tokens_per_second_per_gpu": 10813.83, "total_tokens": 1568307789 }, { "epoch": 0.9932483120780196, "grad_norm": 0.8783993721008301, "learning_rate": 2e-05, "loss": 0.6038, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15888, "tokens_per_second_per_gpu": 11305.43, "total_tokens": 1568411859 }, { "epoch": 0.9933108277069267, "grad_norm": 0.8871813416481018, "learning_rate": 2e-05, "loss": 0.6025, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15889, "tokens_per_second_per_gpu": 11001.13, "total_tokens": 1568513113 }, { "epoch": 0.9933733433358339, "grad_norm": 0.9001262784004211, "learning_rate": 2e-05, "loss": 0.5719, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15890, "tokens_per_second_per_gpu": 10561.21, "total_tokens": 1568610588 }, { "epoch": 0.9934358589647412, "grad_norm": 0.8906770944595337, "learning_rate": 2e-05, "loss": 0.6384, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15891, "tokens_per_second_per_gpu": 10642.03, "total_tokens": 1568711665 }, { "epoch": 0.9934983745936484, "grad_norm": 0.8704009056091309, "learning_rate": 2e-05, "loss": 0.5928, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15892, "tokens_per_second_per_gpu": 10226.79, "total_tokens": 1568811080 }, { "epoch": 0.9935608902225557, "grad_norm": 0.9080421328544617, "learning_rate": 2e-05, "loss": 0.609, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15893, "tokens_per_second_per_gpu": 10167.12, "total_tokens": 1568906849 }, { "epoch": 0.9936234058514629, "grad_norm": 0.8829928040504456, "learning_rate": 2e-05, "loss": 0.6033, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15894, "tokens_per_second_per_gpu": 10327.81, "total_tokens": 1569000396 }, { "epoch": 0.99368592148037, "grad_norm": 0.903319776058197, "learning_rate": 2e-05, "loss": 0.6675, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15895, "tokens_per_second_per_gpu": 10509.84, "total_tokens": 1569100011 }, { "epoch": 0.9937484371092773, "grad_norm": 0.902959942817688, "learning_rate": 2e-05, "loss": 0.6276, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15896, "tokens_per_second_per_gpu": 10325.13, "total_tokens": 1569199657 }, { "epoch": 0.9938109527381845, "grad_norm": 0.9078430533409119, "learning_rate": 2e-05, "loss": 0.5909, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15897, "tokens_per_second_per_gpu": 9478.09, "total_tokens": 1569295138 }, { "epoch": 0.9938734683670918, "grad_norm": 0.9089625477790833, "learning_rate": 2e-05, "loss": 0.5944, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15898, "tokens_per_second_per_gpu": 10840.99, "total_tokens": 1569394435 }, { "epoch": 0.993935983995999, "grad_norm": 0.9403334259986877, "learning_rate": 2e-05, "loss": 0.6699, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15899, "tokens_per_second_per_gpu": 10225.69, "total_tokens": 1569489554 }, { "epoch": 0.9939984996249063, "grad_norm": 0.9030489921569824, "learning_rate": 2e-05, "loss": 0.5805, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15900, "tokens_per_second_per_gpu": 9880.77, "total_tokens": 1569585145 }, { "epoch": 0.9940610152538134, "grad_norm": 0.9165311455726624, "learning_rate": 2e-05, "loss": 0.5938, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15901, "tokens_per_second_per_gpu": 10526.69, "total_tokens": 1569680884 }, { "epoch": 0.9941235308827207, "grad_norm": 0.8503122329711914, "learning_rate": 2e-05, "loss": 0.5673, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15902, "tokens_per_second_per_gpu": 10013.52, "total_tokens": 1569779986 }, { "epoch": 0.9941860465116279, "grad_norm": 0.8639692068099976, "learning_rate": 2e-05, "loss": 0.5851, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15903, "tokens_per_second_per_gpu": 10525.96, "total_tokens": 1569878272 }, { "epoch": 0.9942485621405351, "grad_norm": 0.8826373219490051, "learning_rate": 2e-05, "loss": 0.6134, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15904, "tokens_per_second_per_gpu": 10191.16, "total_tokens": 1569975885 }, { "epoch": 0.9943110777694424, "grad_norm": 0.876602292060852, "learning_rate": 2e-05, "loss": 0.6217, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15905, "tokens_per_second_per_gpu": 10864.29, "total_tokens": 1570074003 }, { "epoch": 0.9943735933983496, "grad_norm": 0.8997625708580017, "learning_rate": 2e-05, "loss": 0.6351, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15906, "tokens_per_second_per_gpu": 10946.73, "total_tokens": 1570173655 }, { "epoch": 0.9944361090272568, "grad_norm": 0.8602683544158936, "learning_rate": 2e-05, "loss": 0.5951, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15907, "tokens_per_second_per_gpu": 10270.68, "total_tokens": 1570272567 }, { "epoch": 0.994498624656164, "grad_norm": 0.8846794962882996, "learning_rate": 2e-05, "loss": 0.5935, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15908, "tokens_per_second_per_gpu": 10102.04, "total_tokens": 1570368978 }, { "epoch": 0.9945611402850713, "grad_norm": 0.8737883567810059, "learning_rate": 2e-05, "loss": 0.5989, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15909, "tokens_per_second_per_gpu": 10754.01, "total_tokens": 1570468216 }, { "epoch": 0.9946236559139785, "grad_norm": 0.8562959432601929, "learning_rate": 2e-05, "loss": 0.6381, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15910, "tokens_per_second_per_gpu": 10975.01, "total_tokens": 1570571528 }, { "epoch": 0.9946861715428857, "grad_norm": 0.8569046854972839, "learning_rate": 2e-05, "loss": 0.5829, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15911, "tokens_per_second_per_gpu": 11085.29, "total_tokens": 1570672171 }, { "epoch": 0.994748687171793, "grad_norm": 0.9206390380859375, "learning_rate": 2e-05, "loss": 0.6582, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15912, "tokens_per_second_per_gpu": 10215.46, "total_tokens": 1570771408 }, { "epoch": 0.9948112028007001, "grad_norm": 0.9355631470680237, "learning_rate": 2e-05, "loss": 0.6393, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15913, "tokens_per_second_per_gpu": 11363.25, "total_tokens": 1570873271 }, { "epoch": 0.9948737184296074, "grad_norm": 0.9534790515899658, "learning_rate": 2e-05, "loss": 0.6014, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15914, "tokens_per_second_per_gpu": 9602.93, "total_tokens": 1570966442 }, { "epoch": 0.9949362340585146, "grad_norm": 0.8859280943870544, "learning_rate": 2e-05, "loss": 0.5673, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15915, "tokens_per_second_per_gpu": 9866.6, "total_tokens": 1571060985 }, { "epoch": 0.9949987496874219, "grad_norm": 0.9179317951202393, "learning_rate": 2e-05, "loss": 0.5994, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15916, "tokens_per_second_per_gpu": 10355.67, "total_tokens": 1571160114 }, { "epoch": 0.9950612653163291, "grad_norm": 0.9243425726890564, "learning_rate": 2e-05, "loss": 0.5932, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15917, "tokens_per_second_per_gpu": 9650.46, "total_tokens": 1571256065 }, { "epoch": 0.9951237809452363, "grad_norm": 0.8809751868247986, "learning_rate": 2e-05, "loss": 0.6683, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15918, "tokens_per_second_per_gpu": 10650.43, "total_tokens": 1571357178 }, { "epoch": 0.9951862965741435, "grad_norm": 0.9216481447219849, "learning_rate": 2e-05, "loss": 0.5864, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15919, "tokens_per_second_per_gpu": 10777.94, "total_tokens": 1571453741 }, { "epoch": 0.9952488122030507, "grad_norm": 0.9298375844955444, "learning_rate": 2e-05, "loss": 0.5868, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15920, "tokens_per_second_per_gpu": 9865.02, "total_tokens": 1571546552 }, { "epoch": 0.995311327831958, "grad_norm": 0.8625885844230652, "learning_rate": 2e-05, "loss": 0.5702, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15921, "tokens_per_second_per_gpu": 9699.78, "total_tokens": 1571641523 }, { "epoch": 0.9953738434608652, "grad_norm": 0.8680540919303894, "learning_rate": 2e-05, "loss": 0.6259, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15922, "tokens_per_second_per_gpu": 10431.37, "total_tokens": 1571739886 }, { "epoch": 0.9954363590897725, "grad_norm": 0.8703250885009766, "learning_rate": 2e-05, "loss": 0.6481, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15923, "tokens_per_second_per_gpu": 10838.92, "total_tokens": 1571842464 }, { "epoch": 0.9954988747186797, "grad_norm": 0.8825011849403381, "learning_rate": 2e-05, "loss": 0.6081, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15924, "tokens_per_second_per_gpu": 10693.15, "total_tokens": 1571940982 }, { "epoch": 0.995561390347587, "grad_norm": 0.9170737266540527, "learning_rate": 2e-05, "loss": 0.5903, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15925, "tokens_per_second_per_gpu": 10927.76, "total_tokens": 1572040101 }, { "epoch": 0.9956239059764941, "grad_norm": 0.9037505388259888, "learning_rate": 2e-05, "loss": 0.6231, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15926, "tokens_per_second_per_gpu": 10614.25, "total_tokens": 1572134026 }, { "epoch": 0.9956864216054013, "grad_norm": 0.8876904249191284, "learning_rate": 2e-05, "loss": 0.6158, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15927, "tokens_per_second_per_gpu": 10788.52, "total_tokens": 1572232770 }, { "epoch": 0.9957489372343086, "grad_norm": 0.8798724412918091, "learning_rate": 2e-05, "loss": 0.5738, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15928, "tokens_per_second_per_gpu": 10288.02, "total_tokens": 1572331883 }, { "epoch": 0.9958114528632158, "grad_norm": 0.9245285391807556, "learning_rate": 2e-05, "loss": 0.5983, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15929, "tokens_per_second_per_gpu": 9812.76, "total_tokens": 1572424218 }, { "epoch": 0.9958739684921231, "grad_norm": 0.9349841475486755, "learning_rate": 2e-05, "loss": 0.643, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15930, "tokens_per_second_per_gpu": 10475.83, "total_tokens": 1572522409 }, { "epoch": 0.9959364841210303, "grad_norm": 0.8968011736869812, "learning_rate": 2e-05, "loss": 0.6106, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15931, "tokens_per_second_per_gpu": 10375.58, "total_tokens": 1572620364 }, { "epoch": 0.9959989997499374, "grad_norm": 0.8812810182571411, "learning_rate": 2e-05, "loss": 0.6251, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15932, "tokens_per_second_per_gpu": 10530.58, "total_tokens": 1572719392 }, { "epoch": 0.9960615153788447, "grad_norm": 0.9312660694122314, "learning_rate": 2e-05, "loss": 0.6173, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15933, "tokens_per_second_per_gpu": 10958.4, "total_tokens": 1572818129 }, { "epoch": 0.9961240310077519, "grad_norm": 0.8656426072120667, "learning_rate": 2e-05, "loss": 0.6159, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15934, "tokens_per_second_per_gpu": 10053.57, "total_tokens": 1572913044 }, { "epoch": 0.9961865466366592, "grad_norm": 0.9174478650093079, "learning_rate": 2e-05, "loss": 0.6254, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15935, "tokens_per_second_per_gpu": 10015.22, "total_tokens": 1573011885 }, { "epoch": 0.9962490622655664, "grad_norm": 0.8905065059661865, "learning_rate": 2e-05, "loss": 0.5871, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15936, "tokens_per_second_per_gpu": 10107.53, "total_tokens": 1573108719 }, { "epoch": 0.9963115778944737, "grad_norm": 0.8764336109161377, "learning_rate": 2e-05, "loss": 0.622, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15937, "tokens_per_second_per_gpu": 10086.76, "total_tokens": 1573203827 }, { "epoch": 0.9963740935233808, "grad_norm": 0.9234260320663452, "learning_rate": 2e-05, "loss": 0.5949, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15938, "tokens_per_second_per_gpu": 10175.18, "total_tokens": 1573297056 }, { "epoch": 0.996436609152288, "grad_norm": 0.9053216576576233, "learning_rate": 2e-05, "loss": 0.5977, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15939, "tokens_per_second_per_gpu": 10933.56, "total_tokens": 1573394544 }, { "epoch": 0.9964991247811953, "grad_norm": 0.8521853089332581, "learning_rate": 2e-05, "loss": 0.5507, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15940, "tokens_per_second_per_gpu": 9973.33, "total_tokens": 1573490691 }, { "epoch": 0.9965616404101025, "grad_norm": 0.8798545002937317, "learning_rate": 2e-05, "loss": 0.5806, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15941, "tokens_per_second_per_gpu": 9919.38, "total_tokens": 1573585450 }, { "epoch": 0.9966241560390098, "grad_norm": 0.8915892243385315, "learning_rate": 2e-05, "loss": 0.591, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15942, "tokens_per_second_per_gpu": 10406.71, "total_tokens": 1573686369 }, { "epoch": 0.996686671667917, "grad_norm": 0.8984184861183167, "learning_rate": 2e-05, "loss": 0.6104, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15943, "tokens_per_second_per_gpu": 9923.12, "total_tokens": 1573781514 }, { "epoch": 0.9967491872968242, "grad_norm": 0.886726438999176, "learning_rate": 2e-05, "loss": 0.6008, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15944, "tokens_per_second_per_gpu": 9936.78, "total_tokens": 1573880448 }, { "epoch": 0.9968117029257314, "grad_norm": 0.9228744506835938, "learning_rate": 2e-05, "loss": 0.6175, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15945, "tokens_per_second_per_gpu": 10842.42, "total_tokens": 1573976723 }, { "epoch": 0.9968742185546386, "grad_norm": 0.9257989525794983, "learning_rate": 2e-05, "loss": 0.6368, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15946, "tokens_per_second_per_gpu": 10209.35, "total_tokens": 1574072967 }, { "epoch": 0.9969367341835459, "grad_norm": 0.8690028786659241, "learning_rate": 2e-05, "loss": 0.5876, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15947, "tokens_per_second_per_gpu": 10381.09, "total_tokens": 1574171296 }, { "epoch": 0.9969992498124531, "grad_norm": 0.9001195430755615, "learning_rate": 2e-05, "loss": 0.6293, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15948, "tokens_per_second_per_gpu": 10012.79, "total_tokens": 1574268263 }, { "epoch": 0.9970617654413604, "grad_norm": 0.9336002469062805, "learning_rate": 2e-05, "loss": 0.6441, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15949, "tokens_per_second_per_gpu": 10548.66, "total_tokens": 1574368410 }, { "epoch": 0.9971242810702675, "grad_norm": 0.8836472034454346, "learning_rate": 2e-05, "loss": 0.6155, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15950, "tokens_per_second_per_gpu": 9992.67, "total_tokens": 1574465178 }, { "epoch": 0.9971867966991748, "grad_norm": 0.8762037754058838, "learning_rate": 2e-05, "loss": 0.6235, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15951, "tokens_per_second_per_gpu": 10374.35, "total_tokens": 1574567219 }, { "epoch": 0.997249312328082, "grad_norm": 0.8937474489212036, "learning_rate": 2e-05, "loss": 0.6106, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15952, "tokens_per_second_per_gpu": 10623.92, "total_tokens": 1574667883 }, { "epoch": 0.9973118279569892, "grad_norm": 0.8676392436027527, "learning_rate": 2e-05, "loss": 0.593, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15953, "tokens_per_second_per_gpu": 10825.91, "total_tokens": 1574765687 }, { "epoch": 0.9973743435858965, "grad_norm": 0.9323796033859253, "learning_rate": 2e-05, "loss": 0.6051, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15954, "tokens_per_second_per_gpu": 10160.98, "total_tokens": 1574859083 }, { "epoch": 0.9974368592148037, "grad_norm": 0.8754019737243652, "learning_rate": 2e-05, "loss": 0.6249, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15955, "tokens_per_second_per_gpu": 10796.56, "total_tokens": 1574957252 }, { "epoch": 0.9974993748437109, "grad_norm": 0.8780397772789001, "learning_rate": 2e-05, "loss": 0.6177, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15956, "tokens_per_second_per_gpu": 10889.8, "total_tokens": 1575056824 }, { "epoch": 0.9975618904726181, "grad_norm": 0.945154070854187, "learning_rate": 2e-05, "loss": 0.6145, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15957, "tokens_per_second_per_gpu": 10284.41, "total_tokens": 1575150605 }, { "epoch": 0.9976244061015254, "grad_norm": 0.897677481174469, "learning_rate": 2e-05, "loss": 0.6452, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15958, "tokens_per_second_per_gpu": 11062.7, "total_tokens": 1575251643 }, { "epoch": 0.9976869217304326, "grad_norm": 0.9515111446380615, "learning_rate": 2e-05, "loss": 0.5865, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15959, "tokens_per_second_per_gpu": 14160.41, "total_tokens": 1575345847 }, { "epoch": 0.9977494373593399, "grad_norm": 0.8954275250434875, "learning_rate": 2e-05, "loss": 0.6355, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15960, "tokens_per_second_per_gpu": 11263.71, "total_tokens": 1575446567 }, { "epoch": 0.9978119529882471, "grad_norm": 0.9119465947151184, "learning_rate": 2e-05, "loss": 0.6156, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15961, "tokens_per_second_per_gpu": 10414.8, "total_tokens": 1575546263 }, { "epoch": 0.9978744686171543, "grad_norm": 0.8830984234809875, "learning_rate": 2e-05, "loss": 0.6174, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15962, "tokens_per_second_per_gpu": 9813.32, "total_tokens": 1575643744 }, { "epoch": 0.9979369842460615, "grad_norm": 0.8855180740356445, "learning_rate": 2e-05, "loss": 0.5462, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15963, "tokens_per_second_per_gpu": 9735.37, "total_tokens": 1575739721 }, { "epoch": 0.9979994998749687, "grad_norm": 0.918298065662384, "learning_rate": 2e-05, "loss": 0.628, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15964, "tokens_per_second_per_gpu": 10661.76, "total_tokens": 1575834733 }, { "epoch": 0.998062015503876, "grad_norm": 0.940884530544281, "learning_rate": 2e-05, "loss": 0.6215, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15965, "tokens_per_second_per_gpu": 9679.53, "total_tokens": 1575927015 }, { "epoch": 0.9981245311327832, "grad_norm": 0.8856614232063293, "learning_rate": 2e-05, "loss": 0.6262, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15966, "tokens_per_second_per_gpu": 10423.17, "total_tokens": 1576028566 }, { "epoch": 0.9981870467616905, "grad_norm": 0.9060172438621521, "learning_rate": 2e-05, "loss": 0.6249, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15967, "tokens_per_second_per_gpu": 11288.07, "total_tokens": 1576129103 }, { "epoch": 0.9982495623905977, "grad_norm": 0.910991907119751, "learning_rate": 2e-05, "loss": 0.6385, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15968, "tokens_per_second_per_gpu": 11038.13, "total_tokens": 1576231633 }, { "epoch": 0.9983120780195048, "grad_norm": 0.9277207255363464, "learning_rate": 2e-05, "loss": 0.6479, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15969, "tokens_per_second_per_gpu": 10546.88, "total_tokens": 1576329901 }, { "epoch": 0.9983745936484121, "grad_norm": 0.8840106725692749, "learning_rate": 2e-05, "loss": 0.6065, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15970, "tokens_per_second_per_gpu": 10140.63, "total_tokens": 1576427721 }, { "epoch": 0.9984371092773193, "grad_norm": 0.9057885408401489, "learning_rate": 2e-05, "loss": 0.5886, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15971, "tokens_per_second_per_gpu": 9804.21, "total_tokens": 1576522590 }, { "epoch": 0.9984996249062266, "grad_norm": 0.8748772144317627, "learning_rate": 2e-05, "loss": 0.5976, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15972, "tokens_per_second_per_gpu": 10612.35, "total_tokens": 1576619670 }, { "epoch": 0.9985621405351338, "grad_norm": 0.9160751104354858, "learning_rate": 2e-05, "loss": 0.63, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15973, "tokens_per_second_per_gpu": 10006.86, "total_tokens": 1576715331 }, { "epoch": 0.9986246561640411, "grad_norm": 0.9305205941200256, "learning_rate": 2e-05, "loss": 0.5956, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15974, "tokens_per_second_per_gpu": 10421.11, "total_tokens": 1576812132 }, { "epoch": 0.9986871717929482, "grad_norm": 0.8722079396247864, "learning_rate": 2e-05, "loss": 0.6138, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15975, "tokens_per_second_per_gpu": 10540.14, "total_tokens": 1576910719 }, { "epoch": 0.9987496874218554, "grad_norm": 0.912230372428894, "learning_rate": 2e-05, "loss": 0.6298, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15976, "tokens_per_second_per_gpu": 9269.95, "total_tokens": 1577003970 }, { "epoch": 0.9988122030507627, "grad_norm": 0.9397334456443787, "learning_rate": 2e-05, "loss": 0.5834, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15977, "tokens_per_second_per_gpu": 10475.98, "total_tokens": 1577096294 }, { "epoch": 0.9988747186796699, "grad_norm": 0.8853858113288879, "learning_rate": 2e-05, "loss": 0.5819, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15978, "tokens_per_second_per_gpu": 9964.87, "total_tokens": 1577189064 }, { "epoch": 0.9989372343085772, "grad_norm": 0.9105007648468018, "learning_rate": 2e-05, "loss": 0.5686, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15979, "tokens_per_second_per_gpu": 10066.67, "total_tokens": 1577284738 }, { "epoch": 0.9989997499374844, "grad_norm": 0.8816986680030823, "learning_rate": 2e-05, "loss": 0.6444, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15980, "tokens_per_second_per_gpu": 10409.96, "total_tokens": 1577382685 }, { "epoch": 0.9990622655663915, "grad_norm": 0.8864342570304871, "learning_rate": 2e-05, "loss": 0.5418, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15981, "tokens_per_second_per_gpu": 9387.98, "total_tokens": 1577472971 }, { "epoch": 0.9991247811952988, "grad_norm": 0.8612464666366577, "learning_rate": 2e-05, "loss": 0.5723, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15982, "tokens_per_second_per_gpu": 9898.58, "total_tokens": 1577567546 }, { "epoch": 0.999187296824206, "grad_norm": 0.9199320673942566, "learning_rate": 2e-05, "loss": 0.6723, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15983, "tokens_per_second_per_gpu": 10976.84, "total_tokens": 1577667758 }, { "epoch": 0.9992498124531133, "grad_norm": 0.9476816654205322, "learning_rate": 2e-05, "loss": 0.6361, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15984, "tokens_per_second_per_gpu": 9729.24, "total_tokens": 1577763604 }, { "epoch": 0.9993123280820205, "grad_norm": 0.899088442325592, "learning_rate": 2e-05, "loss": 0.6736, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15985, "tokens_per_second_per_gpu": 9430.29, "total_tokens": 1577859155 }, { "epoch": 0.9993748437109278, "grad_norm": 0.8974953889846802, "learning_rate": 2e-05, "loss": 0.6516, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15986, "tokens_per_second_per_gpu": 9722.24, "total_tokens": 1577952699 }, { "epoch": 0.9994373593398349, "grad_norm": 0.9061396718025208, "learning_rate": 2e-05, "loss": 0.5855, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15987, "tokens_per_second_per_gpu": 9940.49, "total_tokens": 1578047315 }, { "epoch": 0.9994998749687422, "grad_norm": 0.9166470766067505, "learning_rate": 2e-05, "loss": 0.6226, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15988, "tokens_per_second_per_gpu": 10398.6, "total_tokens": 1578143927 }, { "epoch": 0.9995623905976494, "grad_norm": 0.8588995337486267, "learning_rate": 2e-05, "loss": 0.6107, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15989, "tokens_per_second_per_gpu": 10573.83, "total_tokens": 1578241941 }, { "epoch": 0.9996249062265566, "grad_norm": 0.8772311210632324, "learning_rate": 2e-05, "loss": 0.6042, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15990, "tokens_per_second_per_gpu": 10119.82, "total_tokens": 1578340483 }, { "epoch": 0.9996874218554639, "grad_norm": 0.9409469962120056, "learning_rate": 2e-05, "loss": 0.6372, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15991, "tokens_per_second_per_gpu": 11537.08, "total_tokens": 1578438408 }, { "epoch": 0.9997499374843711, "grad_norm": 0.8814605474472046, "learning_rate": 2e-05, "loss": 0.6083, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15992, "tokens_per_second_per_gpu": 10512.29, "total_tokens": 1578537439 }, { "epoch": 0.9998124531132783, "grad_norm": 0.9023396968841553, "learning_rate": 2e-05, "loss": 0.5975, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15993, "tokens_per_second_per_gpu": 9928.6, "total_tokens": 1578633258 }, { "epoch": 0.9998749687421855, "grad_norm": 0.8992229700088501, "learning_rate": 2e-05, "loss": 0.6234, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15994, "tokens_per_second_per_gpu": 10363.65, "total_tokens": 1578728333 }, { "epoch": 0.9999374843710928, "grad_norm": 0.9204303622245789, "learning_rate": 2e-05, "loss": 0.6015, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15995, "tokens_per_second_per_gpu": 10089.58, "total_tokens": 1578823676 }, { "epoch": 1.0, "grad_norm": 0.8932250738143921, "learning_rate": 2e-05, "loss": 0.6697, "memory/device_reserved (GiB)": 45.5, "memory/max_active (GiB)": 44.03, "memory/max_allocated (GiB)": 44.03, "step": 15996, "tokens_per_second_per_gpu": 9881.07, "total_tokens": 1578920701 } ], "logging_steps": 1, "max_steps": 15996, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 7998, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.5459183371169563e+19, "train_batch_size": 4, "trial_name": null, "trial_params": null }