{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 2.250562640660165, "eval_steps": 1000, "global_step": 3000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0, "eval_loss": 1.6887853145599365, "eval_ppl": 5.4129, "eval_runtime": 167.3526, "eval_samples_per_second": 4.362, "eval_steps_per_second": 0.872, "memory/device_reserved (GiB)": 139.12, "memory/max_active (GiB)": 18.94, "memory/max_allocated (GiB)": 18.94, "step": 0 }, { "epoch": 0.018754688672168042, "grad_norm": 1.415561556816101, "learning_rate": 4.8e-05, "loss": 1.6848, "memory/device_reserved (GiB)": 139.11, "memory/max_active (GiB)": 25.53, "memory/max_allocated (GiB)": 25.53, "ppl": 5.3914, "step": 25, "tokens_per_second_per_gpu": 16277.76, "total_tokens": 1723633 }, { "epoch": 0.037509377344336084, "grad_norm": 0.33179354667663574, "learning_rate": 9.8e-05, "loss": 0.9839, "memory/device_reserved (GiB)": 139.06, "memory/max_active (GiB)": 25.53, "memory/max_allocated (GiB)": 25.53, "ppl": 2.6749, "step": 50, "tokens_per_second_per_gpu": 4303.21, "total_tokens": 2175386 }, { "epoch": 0.056264066016504126, "grad_norm": 0.17453454434871674, "learning_rate": 0.000148, "loss": 0.8002, "memory/device_reserved (GiB)": 139.06, "memory/max_active (GiB)": 25.53, "memory/max_allocated (GiB)": 25.53, "ppl": 2.226, "step": 75, "tokens_per_second_per_gpu": 3776.03, "total_tokens": 2623712 }, { "epoch": 0.07501875468867217, "grad_norm": 0.19318008422851562, "learning_rate": 0.00019800000000000002, "loss": 0.7218, "memory/device_reserved (GiB)": 139.06, "memory/max_active (GiB)": 25.53, "memory/max_allocated (GiB)": 25.53, "ppl": 2.0581, "step": 100, "tokens_per_second_per_gpu": 4252.49, "total_tokens": 3072519 }, { "epoch": 0.09377344336084022, "grad_norm": 0.18435686826705933, "learning_rate": 0.00019998127418269004, "loss": 0.6759, "memory/device_reserved (GiB)": 139.06, "memory/max_active (GiB)": 25.53, "memory/max_allocated (GiB)": 25.53, "ppl": 1.9658, "step": 125, "tokens_per_second_per_gpu": 4303.31, "total_tokens": 3523983 }, { "epoch": 0.11252813203300825, "grad_norm": 0.19870473444461823, "learning_rate": 0.00019992195096972548, "loss": 0.6703, "memory/device_reserved (GiB)": 139.06, "memory/max_active (GiB)": 25.53, "memory/max_allocated (GiB)": 25.53, "ppl": 1.9548, "step": 150, "tokens_per_second_per_gpu": 4260.86, "total_tokens": 3973452 }, { "epoch": 0.1312828207051763, "grad_norm": 0.20499658584594727, "learning_rate": 0.0001998220219574743, "loss": 0.6381, "memory/device_reserved (GiB)": 139.06, "memory/max_active (GiB)": 25.53, "memory/max_allocated (GiB)": 25.53, "ppl": 1.8929, "step": 175, "tokens_per_second_per_gpu": 4288.64, "total_tokens": 4423763 }, { "epoch": 0.15003750937734434, "grad_norm": 0.18934418261051178, "learning_rate": 0.00019968152775460537, "loss": 0.6383, "memory/device_reserved (GiB)": 139.06, "memory/max_active (GiB)": 25.53, "memory/max_allocated (GiB)": 25.53, "ppl": 1.8933, "step": 200, "tokens_per_second_per_gpu": 4244.79, "total_tokens": 4872365 }, { "epoch": 0.16879219804951237, "grad_norm": 0.1827855408191681, "learning_rate": 0.00019950052545447352, "loss": 0.6347, "memory/device_reserved (GiB)": 139.06, "memory/max_active (GiB)": 25.53, "memory/max_allocated (GiB)": 25.53, "ppl": 1.8865, "step": 225, "tokens_per_second_per_gpu": 4252.71, "total_tokens": 5319322 }, { "epoch": 0.18754688672168043, "grad_norm": 0.16483066976070404, "learning_rate": 0.00019927908861191827, "loss": 0.6392, "memory/device_reserved (GiB)": 139.06, "memory/max_active (GiB)": 25.53, "memory/max_allocated (GiB)": 25.53, "ppl": 1.895, "step": 250, "tokens_per_second_per_gpu": 3772.0, "total_tokens": 5768644 }, { "epoch": 0.20630157539384847, "grad_norm": 0.17186357080936432, "learning_rate": 0.00019901730721337302, "loss": 0.614, "memory/device_reserved (GiB)": 139.06, "memory/max_active (GiB)": 25.53, "memory/max_allocated (GiB)": 25.53, "ppl": 1.8478, "step": 275, "tokens_per_second_per_gpu": 4281.82, "total_tokens": 6220751 }, { "epoch": 0.2250562640660165, "grad_norm": 0.18073013424873352, "learning_rate": 0.00019871528764029667, "loss": 0.6196, "memory/device_reserved (GiB)": 139.06, "memory/max_active (GiB)": 25.53, "memory/max_allocated (GiB)": 25.53, "ppl": 1.8582, "step": 300, "tokens_per_second_per_gpu": 4234.51, "total_tokens": 6668111 }, { "epoch": 0.24381095273818454, "grad_norm": 0.19639697670936584, "learning_rate": 0.00019837315262594306, "loss": 0.6181, "memory/device_reserved (GiB)": 139.06, "memory/max_active (GiB)": 25.53, "memory/max_allocated (GiB)": 25.53, "ppl": 1.8554, "step": 325, "tokens_per_second_per_gpu": 4261.44, "total_tokens": 7117439 }, { "epoch": 0.2625656414103526, "grad_norm": 0.1670486479997635, "learning_rate": 0.00019799104120548492, "loss": 0.6141, "memory/device_reserved (GiB)": 139.06, "memory/max_active (GiB)": 25.53, "memory/max_allocated (GiB)": 25.53, "ppl": 1.848, "step": 350, "tokens_per_second_per_gpu": 4298.97, "total_tokens": 7569060 }, { "epoch": 0.2813203300825206, "grad_norm": 0.17752495408058167, "learning_rate": 0.00019756910865951377, "loss": 0.6075, "memory/device_reserved (GiB)": 139.06, "memory/max_active (GiB)": 25.53, "memory/max_allocated (GiB)": 25.53, "ppl": 1.8358, "step": 375, "tokens_per_second_per_gpu": 4256.6, "total_tokens": 8017630 }, { "epoch": 0.30007501875468867, "grad_norm": 0.2000180035829544, "learning_rate": 0.00019710752645093747, "loss": 0.6108, "memory/device_reserved (GiB)": 139.06, "memory/max_active (GiB)": 25.53, "memory/max_allocated (GiB)": 25.53, "ppl": 1.8419, "step": 400, "tokens_per_second_per_gpu": 4245.46, "total_tokens": 8464998 }, { "epoch": 0.31882970742685673, "grad_norm": 0.17395919561386108, "learning_rate": 0.00019660648215530206, "loss": 0.5966, "memory/device_reserved (GiB)": 139.06, "memory/max_active (GiB)": 25.53, "memory/max_allocated (GiB)": 25.53, "ppl": 1.8159, "step": 425, "tokens_per_second_per_gpu": 3758.92, "total_tokens": 8914723 }, { "epoch": 0.33758439609902474, "grad_norm": 0.18785236775875092, "learning_rate": 0.00019606617938456572, "loss": 0.6099, "memory/device_reserved (GiB)": 139.06, "memory/max_active (GiB)": 25.53, "memory/max_allocated (GiB)": 25.53, "ppl": 1.8402, "step": 450, "tokens_per_second_per_gpu": 4200.53, "total_tokens": 9359638 }, { "epoch": 0.3563390847711928, "grad_norm": 0.17702797055244446, "learning_rate": 0.0001954868377043559, "loss": 0.5922, "memory/device_reserved (GiB)": 139.06, "memory/max_active (GiB)": 25.53, "memory/max_allocated (GiB)": 25.53, "ppl": 1.808, "step": 475, "tokens_per_second_per_gpu": 4265.36, "total_tokens": 9810837 }, { "epoch": 0.37509377344336087, "grad_norm": 0.19927558302879333, "learning_rate": 0.00019486869254474337, "loss": 0.5759, "memory/device_reserved (GiB)": 139.06, "memory/max_active (GiB)": 25.53, "memory/max_allocated (GiB)": 25.53, "ppl": 1.7787, "step": 500, "tokens_per_second_per_gpu": 4276.25, "total_tokens": 10261446 }, { "epoch": 0.3938484621155289, "grad_norm": 0.1908370852470398, "learning_rate": 0.0001942119951045692, "loss": 0.584, "memory/device_reserved (GiB)": 139.06, "memory/max_active (GiB)": 25.53, "memory/max_allocated (GiB)": 25.53, "ppl": 1.7932, "step": 525, "tokens_per_second_per_gpu": 4272.28, "total_tokens": 10707841 }, { "epoch": 0.41260315078769694, "grad_norm": 0.2064146101474762, "learning_rate": 0.00019351701224936383, "loss": 0.5791, "memory/device_reserved (GiB)": 139.06, "memory/max_active (GiB)": 25.53, "memory/max_allocated (GiB)": 25.53, "ppl": 1.7844, "step": 550, "tokens_per_second_per_gpu": 4250.37, "total_tokens": 11155384 }, { "epoch": 0.43135783945986494, "grad_norm": 0.26748332381248474, "learning_rate": 0.0001927840264028995, "loss": 0.5758, "memory/device_reserved (GiB)": 139.06, "memory/max_active (GiB)": 25.53, "memory/max_allocated (GiB)": 25.53, "ppl": 1.7786, "step": 575, "tokens_per_second_per_gpu": 4256.55, "total_tokens": 11601192 }, { "epoch": 0.450112528132033, "grad_norm": 0.17514832317829132, "learning_rate": 0.00019201333543242036, "loss": 0.5791, "memory/device_reserved (GiB)": 139.06, "memory/max_active (GiB)": 25.53, "memory/max_allocated (GiB)": 25.53, "ppl": 1.7844, "step": 600, "tokens_per_second_per_gpu": 3770.83, "total_tokens": 12048477 }, { "epoch": 0.46886721680420107, "grad_norm": 0.22069169580936432, "learning_rate": 0.00019120525252759647, "loss": 0.5803, "memory/device_reserved (GiB)": 139.06, "memory/max_active (GiB)": 25.53, "memory/max_allocated (GiB)": 25.53, "ppl": 1.7866, "step": 625, "tokens_per_second_per_gpu": 4179.31, "total_tokens": 12488141 }, { "epoch": 0.4876219054763691, "grad_norm": 0.20555566251277924, "learning_rate": 0.00019036010607325138, "loss": 0.5716, "memory/device_reserved (GiB)": 139.06, "memory/max_active (GiB)": 25.53, "memory/max_allocated (GiB)": 25.53, "ppl": 1.7711, "step": 650, "tokens_per_second_per_gpu": 4209.96, "total_tokens": 12934358 }, { "epoch": 0.5063765941485371, "grad_norm": 0.19018156826496124, "learning_rate": 0.00018947823951591478, "loss": 0.5608, "memory/device_reserved (GiB)": 139.06, "memory/max_active (GiB)": 25.53, "memory/max_allocated (GiB)": 25.53, "ppl": 1.7521, "step": 675, "tokens_per_second_per_gpu": 4226.4, "total_tokens": 13378983 }, { "epoch": 0.5251312828207052, "grad_norm": 0.17173859477043152, "learning_rate": 0.00018856001122425416, "loss": 0.5667, "memory/device_reserved (GiB)": 139.06, "memory/max_active (GiB)": 25.53, "memory/max_allocated (GiB)": 25.53, "ppl": 1.7624, "step": 700, "tokens_per_second_per_gpu": 4265.57, "total_tokens": 13829519 }, { "epoch": 0.5438859714928732, "grad_norm": 0.17706550657749176, "learning_rate": 0.0001876057943434428, "loss": 0.565, "memory/device_reserved (GiB)": 139.06, "memory/max_active (GiB)": 25.53, "memory/max_allocated (GiB)": 25.53, "ppl": 1.7594, "step": 725, "tokens_per_second_per_gpu": 4281.61, "total_tokens": 14281879 }, { "epoch": 0.5626406601650412, "grad_norm": 0.18528586626052856, "learning_rate": 0.00018661597664352284, "loss": 0.5666, "memory/device_reserved (GiB)": 139.06, "memory/max_active (GiB)": 25.53, "memory/max_allocated (GiB)": 25.53, "ppl": 1.7623, "step": 750, "tokens_per_second_per_gpu": 4229.32, "total_tokens": 14725919 }, { "epoch": 0.5813953488372093, "grad_norm": 0.16790929436683655, "learning_rate": 0.00018559096036182516, "loss": 0.5633, "memory/device_reserved (GiB)": 139.06, "memory/max_active (GiB)": 25.53, "memory/max_allocated (GiB)": 25.53, "ppl": 1.7565, "step": 775, "tokens_per_second_per_gpu": 3775.0, "total_tokens": 15175146 }, { "epoch": 0.6001500375093773, "grad_norm": 0.17511805891990662, "learning_rate": 0.00018453116203951005, "loss": 0.5664, "memory/device_reserved (GiB)": 139.06, "memory/max_active (GiB)": 25.53, "memory/max_allocated (GiB)": 25.53, "ppl": 1.7619, "step": 800, "tokens_per_second_per_gpu": 4218.07, "total_tokens": 15619901 }, { "epoch": 0.6189047261815454, "grad_norm": 0.19853387773036957, "learning_rate": 0.0001834370123522954, "loss": 0.5646, "memory/device_reserved (GiB)": 139.06, "memory/max_active (GiB)": 25.53, "memory/max_allocated (GiB)": 25.53, "ppl": 1.7587, "step": 825, "tokens_per_second_per_gpu": 4230.84, "total_tokens": 16066102 }, { "epoch": 0.6376594148537135, "grad_norm": 0.18872258067131042, "learning_rate": 0.00018230895593544056, "loss": 0.552, "memory/device_reserved (GiB)": 139.06, "memory/max_active (GiB)": 25.53, "memory/max_allocated (GiB)": 25.53, "ppl": 1.7367, "step": 850, "tokens_per_second_per_gpu": 4222.33, "total_tokens": 16510696 }, { "epoch": 0.6564141035258815, "grad_norm": 0.9702818989753723, "learning_rate": 0.0001811474512030578, "loss": 0.5607, "memory/device_reserved (GiB)": 139.06, "memory/max_active (GiB)": 25.53, "memory/max_allocated (GiB)": 25.53, "ppl": 1.7519, "step": 875, "tokens_per_second_per_gpu": 4200.39, "total_tokens": 16953918 }, { "epoch": 0.6751687921980495, "grad_norm": 0.17479568719863892, "learning_rate": 0.00017995297016182405, "loss": 0.564, "memory/device_reserved (GiB)": 139.06, "memory/max_active (GiB)": 25.53, "memory/max_allocated (GiB)": 25.53, "ppl": 1.7577, "step": 900, "tokens_per_second_per_gpu": 4210.15, "total_tokens": 17396453 }, { "epoch": 0.6939234808702176, "grad_norm": 0.1948954463005066, "learning_rate": 0.0001787259982191692, "loss": 0.5511, "memory/device_reserved (GiB)": 139.06, "memory/max_active (GiB)": 25.53, "memory/max_allocated (GiB)": 25.53, "ppl": 1.7352, "step": 925, "tokens_per_second_per_gpu": 4237.98, "total_tokens": 17841287 }, { "epoch": 0.7126781695423856, "grad_norm": 0.19541053473949432, "learning_rate": 0.00017746703398601872, "loss": 0.5532, "memory/device_reserved (GiB)": 139.06, "memory/max_active (GiB)": 25.53, "memory/max_allocated (GiB)": 25.53, "ppl": 1.7388, "step": 950, "tokens_per_second_per_gpu": 3725.33, "total_tokens": 18283596 }, { "epoch": 0.7314328582145536, "grad_norm": 0.1818365603685379, "learning_rate": 0.0001761765890741701, "loss": 0.5521, "memory/device_reserved (GiB)": 139.06, "memory/max_active (GiB)": 25.53, "memory/max_allocated (GiB)": 25.53, "ppl": 1.7369, "step": 975, "tokens_per_second_per_gpu": 4211.63, "total_tokens": 18726722 }, { "epoch": 0.7501875468867217, "grad_norm": 0.1838025599718094, "learning_rate": 0.00017485518788838705, "loss": 0.5511, "memory/device_reserved (GiB)": 139.06, "memory/max_active (GiB)": 25.53, "memory/max_allocated (GiB)": 25.53, "ppl": 1.7352, "step": 1000, "tokens_per_second_per_gpu": 3962.4, "total_tokens": 19167258 }, { "epoch": 0.7501875468867217, "eval_loss": 0.540988564491272, "eval_ppl": 1.7177, "eval_runtime": 138.0264, "eval_samples_per_second": 5.289, "eval_steps_per_second": 1.058, "memory/device_reserved (GiB)": 139.02, "memory/max_active (GiB)": 19.1, "memory/max_allocated (GiB)": 19.1, "step": 1000 }, { "epoch": 0.7689422355588897, "grad_norm": 0.2199818342924118, "learning_rate": 0.00017350336741329413, "loss": 0.549, "memory/device_reserved (GiB)": 139.06, "memory/max_active (GiB)": 25.53, "memory/max_allocated (GiB)": 25.53, "ppl": 1.7315, "step": 1025, "tokens_per_second_per_gpu": 4129.73, "total_tokens": 20870820 }, { "epoch": 0.7876969242310577, "grad_norm": 0.19783177971839905, "learning_rate": 0.0001721216769951596, "loss": 0.5615, "memory/device_reserved (GiB)": 139.06, "memory/max_active (GiB)": 25.53, "memory/max_allocated (GiB)": 25.53, "ppl": 1.7533, "step": 1050, "tokens_per_second_per_gpu": 4243.63, "total_tokens": 21317982 }, { "epoch": 0.8064516129032258, "grad_norm": 0.1678430140018463, "learning_rate": 0.00017071067811865476, "loss": 0.5557, "memory/device_reserved (GiB)": 139.06, "memory/max_active (GiB)": 25.53, "memory/max_allocated (GiB)": 25.53, "ppl": 1.7432, "step": 1075, "tokens_per_second_per_gpu": 4092.04, "total_tokens": 21754087 }, { "epoch": 0.8252063015753939, "grad_norm": 0.16523879766464233, "learning_rate": 0.00016927094417868048, "loss": 0.556, "memory/device_reserved (GiB)": 139.06, "memory/max_active (GiB)": 25.53, "memory/max_allocated (GiB)": 25.53, "ppl": 1.7437, "step": 1100, "tokens_per_second_per_gpu": 4187.02, "total_tokens": 22198779 }, { "epoch": 0.8439609902475619, "grad_norm": 0.18177717924118042, "learning_rate": 0.00016780306024735382, "loss": 0.5468, "memory/device_reserved (GiB)": 139.06, "memory/max_active (GiB)": 25.53, "memory/max_allocated (GiB)": 25.53, "ppl": 1.7277, "step": 1125, "tokens_per_second_per_gpu": 4198.97, "total_tokens": 22639769 }, { "epoch": 0.8627156789197299, "grad_norm": 0.17299720644950867, "learning_rate": 0.0001663076228362492, "loss": 0.554, "memory/device_reserved (GiB)": 139.06, "memory/max_active (GiB)": 25.53, "memory/max_allocated (GiB)": 25.53, "ppl": 1.7402, "step": 1150, "tokens_per_second_per_gpu": 3762.13, "total_tokens": 23086742 }, { "epoch": 0.881470367591898, "grad_norm": 0.19112971425056458, "learning_rate": 0.00016478523965399085, "loss": 0.5434, "memory/device_reserved (GiB)": 139.06, "memory/max_active (GiB)": 25.53, "memory/max_allocated (GiB)": 25.53, "ppl": 1.7219, "step": 1175, "tokens_per_second_per_gpu": 4205.37, "total_tokens": 23528106 }, { "epoch": 0.900225056264066, "grad_norm": 0.17930163443088531, "learning_rate": 0.00016323652935929536, "loss": 0.5362, "memory/device_reserved (GiB)": 139.06, "memory/max_active (GiB)": 25.53, "memory/max_allocated (GiB)": 25.53, "ppl": 1.7095, "step": 1200, "tokens_per_second_per_gpu": 4228.83, "total_tokens": 23974427 }, { "epoch": 0.918979744936234, "grad_norm": 0.18718039989471436, "learning_rate": 0.00016166212130956382, "loss": 0.5533, "memory/device_reserved (GiB)": 139.06, "memory/max_active (GiB)": 25.53, "memory/max_allocated (GiB)": 25.53, "ppl": 1.739, "step": 1225, "tokens_per_second_per_gpu": 4211.64, "total_tokens": 24415919 }, { "epoch": 0.9377344336084021, "grad_norm": 0.17105573415756226, "learning_rate": 0.0001600626553051268, "loss": 0.5492, "memory/device_reserved (GiB)": 139.06, "memory/max_active (GiB)": 25.53, "memory/max_allocated (GiB)": 25.53, "ppl": 1.7319, "step": 1250, "tokens_per_second_per_gpu": 4183.86, "total_tokens": 24854345 }, { "epoch": 0.9564891222805701, "grad_norm": 0.1733955442905426, "learning_rate": 0.0001584387813292454, "loss": 0.5348, "memory/device_reserved (GiB)": 139.06, "memory/max_active (GiB)": 25.53, "memory/max_allocated (GiB)": 25.53, "ppl": 1.7071, "step": 1275, "tokens_per_second_per_gpu": 4172.93, "total_tokens": 25292647 }, { "epoch": 0.9752438109527382, "grad_norm": 0.1858205944299698, "learning_rate": 0.00015679115928397401, "loss": 0.5527, "memory/device_reserved (GiB)": 139.06, "memory/max_active (GiB)": 25.53, "memory/max_allocated (GiB)": 25.53, "ppl": 1.7379, "step": 1300, "tokens_per_second_per_gpu": 4226.34, "total_tokens": 25733591 }, { "epoch": 0.9939984996249063, "grad_norm": 0.1944192498922348, "learning_rate": 0.00015512045872199276, "loss": 0.5311, "memory/device_reserved (GiB)": 139.06, "memory/max_active (GiB)": 25.53, "memory/max_allocated (GiB)": 25.53, "ppl": 1.7008, "step": 1325, "tokens_per_second_per_gpu": 3655.12, "total_tokens": 26164528 }, { "epoch": 1.0127531882970742, "grad_norm": 0.18358173966407776, "learning_rate": 0.00015342735857451777, "loss": 0.5145, "memory/device_reserved (GiB)": 139.06, "memory/max_active (GiB)": 25.53, "memory/max_allocated (GiB)": 25.53, "ppl": 1.6728, "step": 1350, "tokens_per_second_per_gpu": 4227.25, "total_tokens": 26610460 }, { "epoch": 1.0315078769692423, "grad_norm": 0.1853465735912323, "learning_rate": 0.00015171254687540038, "loss": 0.5081, "memory/device_reserved (GiB)": 139.06, "memory/max_active (GiB)": 25.53, "memory/max_allocated (GiB)": 25.53, "ppl": 1.6621, "step": 1375, "tokens_per_second_per_gpu": 4318.88, "total_tokens": 27064008 }, { "epoch": 1.0502625656414104, "grad_norm": 0.18925060331821442, "learning_rate": 0.0001499767204815273, "loss": 0.5185, "memory/device_reserved (GiB)": 139.06, "memory/max_active (GiB)": 25.53, "memory/max_allocated (GiB)": 25.53, "ppl": 1.6795, "step": 1400, "tokens_per_second_per_gpu": 4324.01, "total_tokens": 27516590 }, { "epoch": 1.0690172543135783, "grad_norm": 0.20961470901966095, "learning_rate": 0.00014822058478963532, "loss": 0.5234, "memory/device_reserved (GiB)": 139.06, "memory/max_active (GiB)": 25.53, "memory/max_allocated (GiB)": 25.53, "ppl": 1.6878, "step": 1425, "tokens_per_second_per_gpu": 4319.64, "total_tokens": 27970075 }, { "epoch": 1.0877719429857464, "grad_norm": 0.1982697695493698, "learning_rate": 0.0001464448534496555, "loss": 0.5169, "memory/device_reserved (GiB)": 139.06, "memory/max_active (GiB)": 25.53, "memory/max_allocated (GiB)": 25.53, "ppl": 1.6768, "step": 1450, "tokens_per_second_per_gpu": 4267.88, "total_tokens": 28419716 }, { "epoch": 1.1065266316579145, "grad_norm": 0.1925143301486969, "learning_rate": 0.00014465024807470376, "loss": 0.5197, "memory/device_reserved (GiB)": 139.06, "memory/max_active (GiB)": 25.53, "memory/max_allocated (GiB)": 25.53, "ppl": 1.6815, "step": 1475, "tokens_per_second_per_gpu": 4264.53, "total_tokens": 28866312 }, { "epoch": 1.1252813203300824, "grad_norm": 0.18788637220859528, "learning_rate": 0.0001428374979478349, "loss": 0.5204, "memory/device_reserved (GiB)": 139.06, "memory/max_active (GiB)": 25.53, "memory/max_allocated (GiB)": 25.53, "ppl": 1.6827, "step": 1500, "tokens_per_second_per_gpu": 3779.33, "total_tokens": 29315968 }, { "epoch": 1.1440360090022506, "grad_norm": 0.18954145908355713, "learning_rate": 0.00014100733972568038, "loss": 0.5164, "memory/device_reserved (GiB)": 139.06, "memory/max_active (GiB)": 25.53, "memory/max_allocated (GiB)": 25.53, "ppl": 1.676, "step": 1525, "tokens_per_second_per_gpu": 4282.57, "total_tokens": 29766723 }, { "epoch": 1.1627906976744187, "grad_norm": 0.19003146886825562, "learning_rate": 0.00013916051713908924, "loss": 0.5095, "memory/device_reserved (GiB)": 139.06, "memory/max_active (GiB)": 25.53, "memory/max_allocated (GiB)": 25.53, "ppl": 1.6645, "step": 1550, "tokens_per_second_per_gpu": 4290.76, "total_tokens": 30218573 }, { "epoch": 1.1815453863465866, "grad_norm": 0.18279583752155304, "learning_rate": 0.00013729778069089437, "loss": 0.522, "memory/device_reserved (GiB)": 139.06, "memory/max_active (GiB)": 25.53, "memory/max_allocated (GiB)": 25.53, "ppl": 1.6854, "step": 1575, "tokens_per_second_per_gpu": 4300.13, "total_tokens": 30669810 }, { "epoch": 1.2003000750187547, "grad_norm": 0.18783092498779297, "learning_rate": 0.00013541988735092672, "loss": 0.5003, "memory/device_reserved (GiB)": 139.06, "memory/max_active (GiB)": 25.53, "memory/max_allocated (GiB)": 25.53, "ppl": 1.6492, "step": 1600, "tokens_per_second_per_gpu": 4271.27, "total_tokens": 31117586 }, { "epoch": 1.2190547636909228, "grad_norm": 0.199558824300766, "learning_rate": 0.00013352760024840175, "loss": 0.5115, "memory/device_reserved (GiB)": 139.06, "memory/max_active (GiB)": 25.53, "memory/max_allocated (GiB)": 25.53, "ppl": 1.6678, "step": 1625, "tokens_per_second_per_gpu": 4248.14, "total_tokens": 31562224 }, { "epoch": 1.2378094523630907, "grad_norm": 0.19465653598308563, "learning_rate": 0.00013162168836180246, "loss": 0.4967, "memory/device_reserved (GiB)": 139.06, "memory/max_active (GiB)": 25.53, "memory/max_allocated (GiB)": 25.53, "ppl": 1.6433, "step": 1650, "tokens_per_second_per_gpu": 4286.24, "total_tokens": 32011071 }, { "epoch": 1.2565641410352588, "grad_norm": 0.2054641842842102, "learning_rate": 0.00012970292620638574, "loss": 0.5172, "memory/device_reserved (GiB)": 139.06, "memory/max_active (GiB)": 25.53, "memory/max_allocated (GiB)": 25.53, "ppl": 1.6773, "step": 1675, "tokens_per_second_per_gpu": 3733.1, "total_tokens": 32452490 }, { "epoch": 1.275318829707427, "grad_norm": 0.19450411200523376, "learning_rate": 0.00012777209351943862, "loss": 0.5149, "memory/device_reserved (GiB)": 139.06, "memory/max_active (GiB)": 25.53, "memory/max_allocated (GiB)": 25.53, "ppl": 1.6735, "step": 1700, "tokens_per_second_per_gpu": 4251.33, "total_tokens": 32899103 }, { "epoch": 1.2940735183795948, "grad_norm": 0.19844166934490204, "learning_rate": 0.0001258299749434123, "loss": 0.5205, "memory/device_reserved (GiB)": 139.06, "memory/max_active (GiB)": 25.53, "memory/max_allocated (GiB)": 25.53, "ppl": 1.6829, "step": 1725, "tokens_per_second_per_gpu": 4240.57, "total_tokens": 33344569 }, { "epoch": 1.312828207051763, "grad_norm": 0.19240470230579376, "learning_rate": 0.00012387735970706312, "loss": 0.5033, "memory/device_reserved (GiB)": 139.06, "memory/max_active (GiB)": 25.53, "memory/max_allocated (GiB)": 25.53, "ppl": 1.6542, "step": 1750, "tokens_per_second_per_gpu": 4267.65, "total_tokens": 33790426 }, { "epoch": 1.331582895723931, "grad_norm": 0.18220192193984985, "learning_rate": 0.00012191504130472937, "loss": 0.5103, "memory/device_reserved (GiB)": 139.06, "memory/max_active (GiB)": 25.53, "memory/max_allocated (GiB)": 25.53, "ppl": 1.6658, "step": 1775, "tokens_per_second_per_gpu": 4237.08, "total_tokens": 34233908 }, { "epoch": 1.350337584396099, "grad_norm": 0.20157551765441895, "learning_rate": 0.00011994381717387514, "loss": 0.5192, "memory/device_reserved (GiB)": 139.06, "memory/max_active (GiB)": 25.53, "memory/max_allocated (GiB)": 25.53, "ppl": 1.6807, "step": 1800, "tokens_per_second_per_gpu": 4244.09, "total_tokens": 34678691 }, { "epoch": 1.369092273068267, "grad_norm": 0.17189238965511322, "learning_rate": 0.00011796448837103129, "loss": 0.5011, "memory/device_reserved (GiB)": 139.06, "memory/max_active (GiB)": 25.53, "memory/max_allocated (GiB)": 25.53, "ppl": 1.6505, "step": 1825, "tokens_per_second_per_gpu": 4277.26, "total_tokens": 35125624 }, { "epoch": 1.387846961740435, "grad_norm": 0.19443106651306152, "learning_rate": 0.00011597785924626616, "loss": 0.4994, "memory/device_reserved (GiB)": 139.06, "memory/max_active (GiB)": 25.53, "memory/max_allocated (GiB)": 25.53, "ppl": 1.6477, "step": 1850, "tokens_per_second_per_gpu": 3766.52, "total_tokens": 35568850 }, { "epoch": 1.406601650412603, "grad_norm": 0.1810811311006546, "learning_rate": 0.00011398473711631764, "loss": 0.5083, "memory/device_reserved (GiB)": 139.06, "memory/max_active (GiB)": 25.53, "memory/max_allocated (GiB)": 25.53, "ppl": 1.6625, "step": 1875, "tokens_per_second_per_gpu": 4204.76, "total_tokens": 36009980 }, { "epoch": 1.4253563390847712, "grad_norm": 0.19805970788002014, "learning_rate": 0.00011198593193651958, "loss": 0.5141, "memory/device_reserved (GiB)": 139.06, "memory/max_active (GiB)": 25.53, "memory/max_allocated (GiB)": 25.53, "ppl": 1.6721, "step": 1900, "tokens_per_second_per_gpu": 4270.21, "total_tokens": 36457032 }, { "epoch": 1.4441110277569393, "grad_norm": 0.1936168372631073, "learning_rate": 0.00010998225597165628, "loss": 0.5045, "memory/device_reserved (GiB)": 139.06, "memory/max_active (GiB)": 25.53, "memory/max_allocated (GiB)": 25.53, "ppl": 1.6562, "step": 1925, "tokens_per_second_per_gpu": 4275.24, "total_tokens": 36905590 }, { "epoch": 1.4628657164291072, "grad_norm": 0.19065748155117035, "learning_rate": 0.00010797452346587798, "loss": 0.5025, "memory/device_reserved (GiB)": 139.06, "memory/max_active (GiB)": 25.53, "memory/max_allocated (GiB)": 25.53, "ppl": 1.6528, "step": 1950, "tokens_per_second_per_gpu": 4285.81, "total_tokens": 37354436 }, { "epoch": 1.4816204051012754, "grad_norm": 0.18647657334804535, "learning_rate": 0.0001059635503118125, "loss": 0.5102, "memory/device_reserved (GiB)": 139.06, "memory/max_active (GiB)": 25.53, "memory/max_allocated (GiB)": 25.53, "ppl": 1.6656, "step": 1975, "tokens_per_second_per_gpu": 4259.76, "total_tokens": 37801500 }, { "epoch": 1.5003750937734432, "grad_norm": 0.21211788058280945, "learning_rate": 0.00010395015371900663, "loss": 0.5052, "memory/device_reserved (GiB)": 139.06, "memory/max_active (GiB)": 25.53, "memory/max_allocated (GiB)": 25.53, "ppl": 1.6573, "step": 2000, "tokens_per_second_per_gpu": 4250.7, "total_tokens": 38244936 }, { "epoch": 1.5003750937734432, "eval_loss": 0.5063687562942505, "eval_ppl": 1.6593, "eval_runtime": 141.112, "eval_samples_per_second": 5.173, "eval_steps_per_second": 1.035, "memory/device_reserved (GiB)": 139.06, "memory/max_active (GiB)": 19.1, "memory/max_allocated (GiB)": 19.1, "step": 2000 }, { "epoch": 1.5191297824456114, "grad_norm": 0.20089760422706604, "learning_rate": 0.00010193515188183245, "loss": 0.4892, "memory/device_reserved (GiB)": 139.06, "memory/max_active (GiB)": 25.53, "memory/max_allocated (GiB)": 25.53, "ppl": 1.631, "step": 2025, "tokens_per_second_per_gpu": 4246.58, "total_tokens": 39959888 }, { "epoch": 1.5378844711177795, "grad_norm": 0.19840118288993835, "learning_rate": 9.991936364699348e-05, "loss": 0.503, "memory/device_reserved (GiB)": 139.06, "memory/max_active (GiB)": 25.53, "memory/max_allocated (GiB)": 25.53, "ppl": 1.6537, "step": 2050, "tokens_per_second_per_gpu": 4320.38, "total_tokens": 40411902 }, { "epoch": 1.5566391597899476, "grad_norm": 0.20045842230319977, "learning_rate": 9.790360818076577e-05, "loss": 0.5127, "memory/device_reserved (GiB)": 139.06, "memory/max_active (GiB)": 25.53, "memory/max_allocated (GiB)": 25.53, "ppl": 1.6698, "step": 2075, "tokens_per_second_per_gpu": 4245.02, "total_tokens": 40855384 }, { "epoch": 1.5753938484621155, "grad_norm": 0.19669026136398315, "learning_rate": 9.588870463610893e-05, "loss": 0.4994, "memory/device_reserved (GiB)": 139.06, "memory/max_active (GiB)": 25.53, "memory/max_allocated (GiB)": 25.53, "ppl": 1.6477, "step": 2100, "tokens_per_second_per_gpu": 4174.18, "total_tokens": 41293525 }, { "epoch": 1.5941485371342836, "grad_norm": 0.19754259288311005, "learning_rate": 9.387547181978291e-05, "loss": 0.5009, "memory/device_reserved (GiB)": 139.06, "memory/max_active (GiB)": 25.53, "memory/max_allocated (GiB)": 25.53, "ppl": 1.6502, "step": 2125, "tokens_per_second_per_gpu": 4200.06, "total_tokens": 41737747 }, { "epoch": 1.6129032258064515, "grad_norm": 0.19482502341270447, "learning_rate": 9.186472785960507e-05, "loss": 0.5002, "memory/device_reserved (GiB)": 139.06, "memory/max_active (GiB)": 25.53, "memory/max_allocated (GiB)": 25.53, "ppl": 1.6491, "step": 2150, "tokens_per_second_per_gpu": 3696.76, "total_tokens": 42176082 }, { "epoch": 1.6316579144786196, "grad_norm": 0.21606561541557312, "learning_rate": 8.985728987198352e-05, "loss": 0.4959, "memory/device_reserved (GiB)": 139.06, "memory/max_active (GiB)": 25.53, "memory/max_allocated (GiB)": 25.53, "ppl": 1.642, "step": 2175, "tokens_per_second_per_gpu": 4192.5, "total_tokens": 42616372 }, { "epoch": 1.6504126031507877, "grad_norm": 0.1979638934135437, "learning_rate": 8.785397362986114e-05, "loss": 0.5031, "memory/device_reserved (GiB)": 139.06, "memory/max_active (GiB)": 25.53, "memory/max_allocated (GiB)": 25.53, "ppl": 1.6538, "step": 2200, "tokens_per_second_per_gpu": 4211.67, "total_tokens": 43058315 }, { "epoch": 1.6691672918229559, "grad_norm": 0.20717743039131165, "learning_rate": 8.58555932312059e-05, "loss": 0.4986, "memory/device_reserved (GiB)": 139.06, "memory/max_active (GiB)": 25.53, "memory/max_allocated (GiB)": 25.53, "ppl": 1.6464, "step": 2225, "tokens_per_second_per_gpu": 4242.04, "total_tokens": 43501960 }, { "epoch": 1.6879219804951238, "grad_norm": 0.18736609816551208, "learning_rate": 8.38629607681815e-05, "loss": 0.4898, "memory/device_reserved (GiB)": 139.06, "memory/max_active (GiB)": 25.53, "memory/max_allocated (GiB)": 25.53, "ppl": 1.632, "step": 2250, "tokens_per_second_per_gpu": 4235.21, "total_tokens": 43947235 }, { "epoch": 1.7066766691672917, "grad_norm": 0.2056591659784317, "learning_rate": 8.187688599713333e-05, "loss": 0.4925, "memory/device_reserved (GiB)": 139.06, "memory/max_active (GiB)": 25.53, "memory/max_allocated (GiB)": 25.53, "ppl": 1.6364, "step": 2275, "tokens_per_second_per_gpu": 4256.41, "total_tokens": 44393451 }, { "epoch": 1.7254313578394598, "grad_norm": 0.19774597883224487, "learning_rate": 7.989817600952376e-05, "loss": 0.4952, "memory/device_reserved (GiB)": 139.06, "memory/max_active (GiB)": 25.53, "memory/max_allocated (GiB)": 25.53, "ppl": 1.6408, "step": 2300, "tokens_per_second_per_gpu": 4224.5, "total_tokens": 44836590 }, { "epoch": 1.744186046511628, "grad_norm": 0.19662383198738098, "learning_rate": 7.792763490394984e-05, "loss": 0.4977, "memory/device_reserved (GiB)": 139.06, "memory/max_active (GiB)": 25.53, "memory/max_allocated (GiB)": 25.53, "ppl": 1.6449, "step": 2325, "tokens_per_second_per_gpu": 3741.52, "total_tokens": 45279799 }, { "epoch": 1.762940735183796, "grad_norm": 0.19400179386138916, "learning_rate": 7.596606345937812e-05, "loss": 0.4965, "memory/device_reserved (GiB)": 139.06, "memory/max_active (GiB)": 25.53, "memory/max_allocated (GiB)": 25.53, "ppl": 1.643, "step": 2350, "tokens_per_second_per_gpu": 4248.51, "total_tokens": 45725602 }, { "epoch": 1.7816954238559641, "grad_norm": 0.20261766016483307, "learning_rate": 7.401425880972742e-05, "loss": 0.5014, "memory/device_reserved (GiB)": 139.06, "memory/max_active (GiB)": 25.53, "memory/max_allocated (GiB)": 25.53, "ppl": 1.651, "step": 2375, "tokens_per_second_per_gpu": 4216.2, "total_tokens": 46167730 }, { "epoch": 1.800450112528132, "grad_norm": 0.20447255671024323, "learning_rate": 7.207301411993387e-05, "loss": 0.4901, "memory/device_reserved (GiB)": 139.06, "memory/max_active (GiB)": 25.53, "memory/max_allocated (GiB)": 25.53, "ppl": 1.6325, "step": 2400, "tokens_per_second_per_gpu": 3727.37, "total_tokens": 46611126 }, { "epoch": 1.8192048012003, "grad_norm": 0.19921696186065674, "learning_rate": 7.014311826362804e-05, "loss": 0.4925, "memory/device_reserved (GiB)": 139.06, "memory/max_active (GiB)": 25.53, "memory/max_allocated (GiB)": 25.53, "ppl": 1.6364, "step": 2425, "tokens_per_second_per_gpu": 4202.19, "total_tokens": 47050763 }, { "epoch": 1.837959489872468, "grad_norm": 0.20095540583133698, "learning_rate": 6.822535550255652e-05, "loss": 0.494, "memory/device_reserved (GiB)": 139.06, "memory/max_active (GiB)": 25.53, "memory/max_allocated (GiB)": 25.53, "ppl": 1.6389, "step": 2450, "tokens_per_second_per_gpu": 4230.16, "total_tokens": 47496926 }, { "epoch": 1.8567141785446362, "grad_norm": 0.20210741460323334, "learning_rate": 6.632050516787719e-05, "loss": 0.5036, "memory/device_reserved (GiB)": 139.06, "memory/max_active (GiB)": 25.53, "memory/max_allocated (GiB)": 25.53, "ppl": 1.6547, "step": 2475, "tokens_per_second_per_gpu": 4256.1, "total_tokens": 47941250 }, { "epoch": 1.8754688672168043, "grad_norm": 0.21025419235229492, "learning_rate": 6.442934134345871e-05, "loss": 0.5019, "memory/device_reserved (GiB)": 139.06, "memory/max_active (GiB)": 25.53, "memory/max_allocated (GiB)": 25.53, "ppl": 1.6519, "step": 2500, "tokens_per_second_per_gpu": 3728.09, "total_tokens": 48383306 }, { "epoch": 1.8942235558889724, "grad_norm": 0.20130059123039246, "learning_rate": 6.255263255131172e-05, "loss": 0.5022, "memory/device_reserved (GiB)": 139.06, "memory/max_active (GiB)": 25.53, "memory/max_allocated (GiB)": 25.53, "ppl": 1.6524, "step": 2525, "tokens_per_second_per_gpu": 4178.95, "total_tokens": 48821862 }, { "epoch": 1.9129782445611403, "grad_norm": 0.19601669907569885, "learning_rate": 6.0691141439280785e-05, "loss": 0.4876, "memory/device_reserved (GiB)": 139.06, "memory/max_active (GiB)": 25.53, "memory/max_allocated (GiB)": 25.53, "ppl": 1.6284, "step": 2550, "tokens_per_second_per_gpu": 3998.52, "total_tokens": 49262344 }, { "epoch": 1.9317329332333082, "grad_norm": 0.20538586378097534, "learning_rate": 5.884562447112331e-05, "loss": 0.4796, "memory/device_reserved (GiB)": 139.06, "memory/max_active (GiB)": 25.53, "memory/max_allocated (GiB)": 25.53, "ppl": 1.6154, "step": 2575, "tokens_per_second_per_gpu": 4192.8, "total_tokens": 49702209 }, { "epoch": 1.9504876219054763, "grad_norm": 0.19957959651947021, "learning_rate": 5.701683161910115e-05, "loss": 0.5017, "memory/device_reserved (GiB)": 139.06, "memory/max_active (GiB)": 25.53, "memory/max_allocated (GiB)": 25.53, "ppl": 1.6515, "step": 2600, "tokens_per_second_per_gpu": 4244.94, "total_tokens": 50147673 }, { "epoch": 1.9692423105776444, "grad_norm": 0.20284536480903625, "learning_rate": 5.520550605921091e-05, "loss": 0.5024, "memory/device_reserved (GiB)": 139.06, "memory/max_active (GiB)": 25.53, "memory/max_allocated (GiB)": 25.53, "ppl": 1.6527, "step": 2625, "tokens_per_second_per_gpu": 4205.45, "total_tokens": 50589478 }, { "epoch": 1.9879969992498125, "grad_norm": 0.2044789344072342, "learning_rate": 5.34123838691753e-05, "loss": 0.4967, "memory/device_reserved (GiB)": 139.06, "memory/max_active (GiB)": 25.53, "memory/max_allocated (GiB)": 25.53, "ppl": 1.6433, "step": 2650, "tokens_per_second_per_gpu": 4204.9, "total_tokens": 51027800 }, { "epoch": 2.0067516879219807, "grad_norm": 0.2125943899154663, "learning_rate": 5.163819372931979e-05, "loss": 0.4862, "memory/device_reserved (GiB)": 139.06, "memory/max_active (GiB)": 25.53, "memory/max_allocated (GiB)": 25.53, "ppl": 1.6261, "step": 2675, "tokens_per_second_per_gpu": 3745.54, "total_tokens": 51469941 }, { "epoch": 2.0255063765941483, "grad_norm": 0.2312517911195755, "learning_rate": 4.9883656626454724e-05, "loss": 0.4782, "memory/device_reserved (GiB)": 139.06, "memory/max_active (GiB)": 25.53, "memory/max_allocated (GiB)": 25.53, "ppl": 1.6132, "step": 2700, "tokens_per_second_per_gpu": 4275.5, "total_tokens": 51921057 }, { "epoch": 2.0442610652663165, "grad_norm": 0.19745635986328125, "learning_rate": 4.81494855608843e-05, "loss": 0.4717, "memory/device_reserved (GiB)": 139.06, "memory/max_active (GiB)": 25.53, "memory/max_allocated (GiB)": 25.53, "ppl": 1.6027, "step": 2725, "tokens_per_second_per_gpu": 4290.88, "total_tokens": 52372623 }, { "epoch": 2.0630157539384846, "grad_norm": 0.22817276418209076, "learning_rate": 4.643638525666095e-05, "loss": 0.4817, "memory/device_reserved (GiB)": 139.06, "memory/max_active (GiB)": 25.53, "memory/max_allocated (GiB)": 25.53, "ppl": 1.6188, "step": 2750, "tokens_per_second_per_gpu": 4292.31, "total_tokens": 52823263 }, { "epoch": 2.0817704426106527, "grad_norm": 0.20878754556179047, "learning_rate": 4.4745051875203134e-05, "loss": 0.4774, "memory/device_reserved (GiB)": 139.06, "memory/max_active (GiB)": 25.53, "memory/max_allocated (GiB)": 25.53, "ppl": 1.6119, "step": 2775, "tokens_per_second_per_gpu": 4287.12, "total_tokens": 53272669 }, { "epoch": 2.100525131282821, "grad_norm": 0.18676196038722992, "learning_rate": 4.307617273239226e-05, "loss": 0.4824, "memory/device_reserved (GiB)": 139.06, "memory/max_active (GiB)": 25.53, "memory/max_allocated (GiB)": 25.53, "ppl": 1.62, "step": 2800, "tokens_per_second_per_gpu": 4304.14, "total_tokens": 53724750 }, { "epoch": 2.119279819954989, "grad_norm": 0.20670537650585175, "learning_rate": 4.1430426019264924e-05, "loss": 0.4701, "memory/device_reserved (GiB)": 139.06, "memory/max_active (GiB)": 25.53, "memory/max_allocated (GiB)": 25.53, "ppl": 1.6002, "step": 2825, "tokens_per_second_per_gpu": 4283.76, "total_tokens": 54172957 }, { "epoch": 2.1380345086271566, "grad_norm": 0.21445906162261963, "learning_rate": 3.980848052641286e-05, "loss": 0.4772, "memory/device_reserved (GiB)": 139.06, "memory/max_active (GiB)": 25.53, "memory/max_allocated (GiB)": 25.53, "ppl": 1.6116, "step": 2850, "tokens_per_second_per_gpu": 3768.93, "total_tokens": 54625827 }, { "epoch": 2.1567891972993247, "grad_norm": 0.21021129190921783, "learning_rate": 3.8210995372202896e-05, "loss": 0.471, "memory/device_reserved (GiB)": 139.06, "memory/max_active (GiB)": 25.53, "memory/max_allocated (GiB)": 25.53, "ppl": 1.6016, "step": 2875, "tokens_per_second_per_gpu": 4286.55, "total_tokens": 55076031 }, { "epoch": 2.175543885971493, "grad_norm": 0.23069453239440918, "learning_rate": 3.663861973492776e-05, "loss": 0.4722, "memory/device_reserved (GiB)": 139.06, "memory/max_active (GiB)": 25.53, "memory/max_allocated (GiB)": 25.53, "ppl": 1.6035, "step": 2900, "tokens_per_second_per_gpu": 4291.53, "total_tokens": 55527864 }, { "epoch": 2.194298574643661, "grad_norm": 0.22328485548496246, "learning_rate": 3.509199258899603e-05, "loss": 0.474, "memory/device_reserved (GiB)": 139.06, "memory/max_active (GiB)": 25.53, "memory/max_allocated (GiB)": 25.53, "ppl": 1.6064, "step": 2925, "tokens_per_second_per_gpu": 4262.17, "total_tokens": 55976245 }, { "epoch": 2.213053263315829, "grad_norm": 0.20422938466072083, "learning_rate": 3.3571742445268995e-05, "loss": 0.4721, "memory/device_reserved (GiB)": 139.06, "memory/max_active (GiB)": 25.53, "memory/max_allocated (GiB)": 25.53, "ppl": 1.6034, "step": 2950, "tokens_per_second_per_gpu": 4339.03, "total_tokens": 56430293 }, { "epoch": 2.231807951987997, "grad_norm": 0.21462033689022064, "learning_rate": 3.2078487095649236e-05, "loss": 0.4798, "memory/device_reserved (GiB)": 139.06, "memory/max_active (GiB)": 25.53, "memory/max_allocated (GiB)": 25.53, "ppl": 1.6158, "step": 2975, "tokens_per_second_per_gpu": 4274.93, "total_tokens": 56879796 }, { "epoch": 2.250562640660165, "grad_norm": 0.21800526976585388, "learning_rate": 3.061283336202545e-05, "loss": 0.4733, "memory/device_reserved (GiB)": 139.06, "memory/max_active (GiB)": 25.53, "memory/max_allocated (GiB)": 25.53, "ppl": 1.6053, "step": 3000, "tokens_per_second_per_gpu": 4290.7, "total_tokens": 57329902 }, { "epoch": 2.250562640660165, "eval_loss": 0.49272674322128296, "eval_ppl": 1.6368, "eval_runtime": 139.4189, "eval_samples_per_second": 5.236, "eval_steps_per_second": 1.047, "memory/device_reserved (GiB)": 139.06, "memory/max_active (GiB)": 19.1, "memory/max_allocated (GiB)": 19.1, "step": 3000 } ], "logging_steps": 25, "max_steps": 3996, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2.44194006269952e+18, "train_batch_size": 5, "trial_name": null, "trial_params": null }