{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.5003750937734432, "eval_steps": 1000, "global_step": 2000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0, "eval_loss": 1.6887853145599365, "eval_ppl": 5.4129, "eval_runtime": 167.3526, "eval_samples_per_second": 4.362, "eval_steps_per_second": 0.872, "memory/device_reserved (GiB)": 139.12, "memory/max_active (GiB)": 18.94, "memory/max_allocated (GiB)": 18.94, "step": 0 }, { "epoch": 0.018754688672168042, "grad_norm": 1.415561556816101, "learning_rate": 4.8e-05, "loss": 1.6848, "memory/device_reserved (GiB)": 139.11, "memory/max_active (GiB)": 25.53, "memory/max_allocated (GiB)": 25.53, "ppl": 5.3914, "step": 25, "tokens_per_second_per_gpu": 16277.76, "total_tokens": 1723633 }, { "epoch": 0.037509377344336084, "grad_norm": 0.33179354667663574, "learning_rate": 9.8e-05, "loss": 0.9839, "memory/device_reserved (GiB)": 139.06, "memory/max_active (GiB)": 25.53, "memory/max_allocated (GiB)": 25.53, "ppl": 2.6749, "step": 50, "tokens_per_second_per_gpu": 4303.21, "total_tokens": 2175386 }, { "epoch": 0.056264066016504126, "grad_norm": 0.17453454434871674, "learning_rate": 0.000148, "loss": 0.8002, "memory/device_reserved (GiB)": 139.06, "memory/max_active (GiB)": 25.53, "memory/max_allocated (GiB)": 25.53, "ppl": 2.226, "step": 75, "tokens_per_second_per_gpu": 3776.03, "total_tokens": 2623712 }, { "epoch": 0.07501875468867217, "grad_norm": 0.19318008422851562, "learning_rate": 0.00019800000000000002, "loss": 0.7218, "memory/device_reserved (GiB)": 139.06, "memory/max_active (GiB)": 25.53, "memory/max_allocated (GiB)": 25.53, "ppl": 2.0581, "step": 100, "tokens_per_second_per_gpu": 4252.49, "total_tokens": 3072519 }, { "epoch": 0.09377344336084022, "grad_norm": 0.18435686826705933, "learning_rate": 0.00019998127418269004, "loss": 0.6759, "memory/device_reserved (GiB)": 139.06, "memory/max_active (GiB)": 25.53, "memory/max_allocated (GiB)": 25.53, "ppl": 1.9658, "step": 125, "tokens_per_second_per_gpu": 4303.31, "total_tokens": 3523983 }, { "epoch": 0.11252813203300825, "grad_norm": 0.19870473444461823, "learning_rate": 0.00019992195096972548, "loss": 0.6703, "memory/device_reserved (GiB)": 139.06, "memory/max_active (GiB)": 25.53, "memory/max_allocated (GiB)": 25.53, "ppl": 1.9548, "step": 150, "tokens_per_second_per_gpu": 4260.86, "total_tokens": 3973452 }, { "epoch": 0.1312828207051763, "grad_norm": 0.20499658584594727, "learning_rate": 0.0001998220219574743, "loss": 0.6381, "memory/device_reserved (GiB)": 139.06, "memory/max_active (GiB)": 25.53, "memory/max_allocated (GiB)": 25.53, "ppl": 1.8929, "step": 175, "tokens_per_second_per_gpu": 4288.64, "total_tokens": 4423763 }, { "epoch": 0.15003750937734434, "grad_norm": 0.18934418261051178, "learning_rate": 0.00019968152775460537, "loss": 0.6383, "memory/device_reserved (GiB)": 139.06, "memory/max_active (GiB)": 25.53, "memory/max_allocated (GiB)": 25.53, "ppl": 1.8933, "step": 200, "tokens_per_second_per_gpu": 4244.79, "total_tokens": 4872365 }, { "epoch": 0.16879219804951237, "grad_norm": 0.1827855408191681, "learning_rate": 0.00019950052545447352, "loss": 0.6347, "memory/device_reserved (GiB)": 139.06, "memory/max_active (GiB)": 25.53, "memory/max_allocated (GiB)": 25.53, "ppl": 1.8865, "step": 225, "tokens_per_second_per_gpu": 4252.71, "total_tokens": 5319322 }, { "epoch": 0.18754688672168043, "grad_norm": 0.16483066976070404, "learning_rate": 0.00019927908861191827, "loss": 0.6392, "memory/device_reserved (GiB)": 139.06, "memory/max_active (GiB)": 25.53, "memory/max_allocated (GiB)": 25.53, "ppl": 1.895, "step": 250, "tokens_per_second_per_gpu": 3772.0, "total_tokens": 5768644 }, { "epoch": 0.20630157539384847, "grad_norm": 0.17186357080936432, "learning_rate": 0.00019901730721337302, "loss": 0.614, "memory/device_reserved (GiB)": 139.06, "memory/max_active (GiB)": 25.53, "memory/max_allocated (GiB)": 25.53, "ppl": 1.8478, "step": 275, "tokens_per_second_per_gpu": 4281.82, "total_tokens": 6220751 }, { "epoch": 0.2250562640660165, "grad_norm": 0.18073013424873352, "learning_rate": 0.00019871528764029667, "loss": 0.6196, "memory/device_reserved (GiB)": 139.06, "memory/max_active (GiB)": 25.53, "memory/max_allocated (GiB)": 25.53, "ppl": 1.8582, "step": 300, "tokens_per_second_per_gpu": 4234.51, "total_tokens": 6668111 }, { "epoch": 0.24381095273818454, "grad_norm": 0.19639697670936584, "learning_rate": 0.00019837315262594306, "loss": 0.6181, "memory/device_reserved (GiB)": 139.06, "memory/max_active (GiB)": 25.53, "memory/max_allocated (GiB)": 25.53, "ppl": 1.8554, "step": 325, "tokens_per_second_per_gpu": 4261.44, "total_tokens": 7117439 }, { "epoch": 0.2625656414103526, "grad_norm": 0.1670486479997635, "learning_rate": 0.00019799104120548492, "loss": 0.6141, "memory/device_reserved (GiB)": 139.06, "memory/max_active (GiB)": 25.53, "memory/max_allocated (GiB)": 25.53, "ppl": 1.848, "step": 350, "tokens_per_second_per_gpu": 4298.97, "total_tokens": 7569060 }, { "epoch": 0.2813203300825206, "grad_norm": 0.17752495408058167, "learning_rate": 0.00019756910865951377, "loss": 0.6075, "memory/device_reserved (GiB)": 139.06, "memory/max_active (GiB)": 25.53, "memory/max_allocated (GiB)": 25.53, "ppl": 1.8358, "step": 375, "tokens_per_second_per_gpu": 4256.6, "total_tokens": 8017630 }, { "epoch": 0.30007501875468867, "grad_norm": 0.2000180035829544, "learning_rate": 0.00019710752645093747, "loss": 0.6108, "memory/device_reserved (GiB)": 139.06, "memory/max_active (GiB)": 25.53, "memory/max_allocated (GiB)": 25.53, "ppl": 1.8419, "step": 400, "tokens_per_second_per_gpu": 4245.46, "total_tokens": 8464998 }, { "epoch": 0.31882970742685673, "grad_norm": 0.17395919561386108, "learning_rate": 0.00019660648215530206, "loss": 0.5966, "memory/device_reserved (GiB)": 139.06, "memory/max_active (GiB)": 25.53, "memory/max_allocated (GiB)": 25.53, "ppl": 1.8159, "step": 425, "tokens_per_second_per_gpu": 3758.92, "total_tokens": 8914723 }, { "epoch": 0.33758439609902474, "grad_norm": 0.18785236775875092, "learning_rate": 0.00019606617938456572, "loss": 0.6099, "memory/device_reserved (GiB)": 139.06, "memory/max_active (GiB)": 25.53, "memory/max_allocated (GiB)": 25.53, "ppl": 1.8402, "step": 450, "tokens_per_second_per_gpu": 4200.53, "total_tokens": 9359638 }, { "epoch": 0.3563390847711928, "grad_norm": 0.17702797055244446, "learning_rate": 0.0001954868377043559, "loss": 0.5922, "memory/device_reserved (GiB)": 139.06, "memory/max_active (GiB)": 25.53, "memory/max_allocated (GiB)": 25.53, "ppl": 1.808, "step": 475, "tokens_per_second_per_gpu": 4265.36, "total_tokens": 9810837 }, { "epoch": 0.37509377344336087, "grad_norm": 0.19927558302879333, "learning_rate": 0.00019486869254474337, "loss": 0.5759, "memory/device_reserved (GiB)": 139.06, "memory/max_active (GiB)": 25.53, "memory/max_allocated (GiB)": 25.53, "ppl": 1.7787, "step": 500, "tokens_per_second_per_gpu": 4276.25, "total_tokens": 10261446 }, { "epoch": 0.3938484621155289, "grad_norm": 0.1908370852470398, "learning_rate": 0.0001942119951045692, "loss": 0.584, "memory/device_reserved (GiB)": 139.06, "memory/max_active (GiB)": 25.53, "memory/max_allocated (GiB)": 25.53, "ppl": 1.7932, "step": 525, "tokens_per_second_per_gpu": 4272.28, "total_tokens": 10707841 }, { "epoch": 0.41260315078769694, "grad_norm": 0.2064146101474762, "learning_rate": 0.00019351701224936383, "loss": 0.5791, "memory/device_reserved (GiB)": 139.06, "memory/max_active (GiB)": 25.53, "memory/max_allocated (GiB)": 25.53, "ppl": 1.7844, "step": 550, "tokens_per_second_per_gpu": 4250.37, "total_tokens": 11155384 }, { "epoch": 0.43135783945986494, "grad_norm": 0.26748332381248474, "learning_rate": 0.0001927840264028995, "loss": 0.5758, "memory/device_reserved (GiB)": 139.06, "memory/max_active (GiB)": 25.53, "memory/max_allocated (GiB)": 25.53, "ppl": 1.7786, "step": 575, "tokens_per_second_per_gpu": 4256.55, "total_tokens": 11601192 }, { "epoch": 0.450112528132033, "grad_norm": 0.17514832317829132, "learning_rate": 0.00019201333543242036, "loss": 0.5791, "memory/device_reserved (GiB)": 139.06, "memory/max_active (GiB)": 25.53, "memory/max_allocated (GiB)": 25.53, "ppl": 1.7844, "step": 600, "tokens_per_second_per_gpu": 3770.83, "total_tokens": 12048477 }, { "epoch": 0.46886721680420107, "grad_norm": 0.22069169580936432, "learning_rate": 0.00019120525252759647, "loss": 0.5803, "memory/device_reserved (GiB)": 139.06, "memory/max_active (GiB)": 25.53, "memory/max_allocated (GiB)": 25.53, "ppl": 1.7866, "step": 625, "tokens_per_second_per_gpu": 4179.31, "total_tokens": 12488141 }, { "epoch": 0.4876219054763691, "grad_norm": 0.20555566251277924, "learning_rate": 0.00019036010607325138, "loss": 0.5716, "memory/device_reserved (GiB)": 139.06, "memory/max_active (GiB)": 25.53, "memory/max_allocated (GiB)": 25.53, "ppl": 1.7711, "step": 650, "tokens_per_second_per_gpu": 4209.96, "total_tokens": 12934358 }, { "epoch": 0.5063765941485371, "grad_norm": 0.19018156826496124, "learning_rate": 0.00018947823951591478, "loss": 0.5608, "memory/device_reserved (GiB)": 139.06, "memory/max_active (GiB)": 25.53, "memory/max_allocated (GiB)": 25.53, "ppl": 1.7521, "step": 675, "tokens_per_second_per_gpu": 4226.4, "total_tokens": 13378983 }, { "epoch": 0.5251312828207052, "grad_norm": 0.17173859477043152, "learning_rate": 0.00018856001122425416, "loss": 0.5667, "memory/device_reserved (GiB)": 139.06, "memory/max_active (GiB)": 25.53, "memory/max_allocated (GiB)": 25.53, "ppl": 1.7624, "step": 700, "tokens_per_second_per_gpu": 4265.57, "total_tokens": 13829519 }, { "epoch": 0.5438859714928732, "grad_norm": 0.17706550657749176, "learning_rate": 0.0001876057943434428, "loss": 0.565, "memory/device_reserved (GiB)": 139.06, "memory/max_active (GiB)": 25.53, "memory/max_allocated (GiB)": 25.53, "ppl": 1.7594, "step": 725, "tokens_per_second_per_gpu": 4281.61, "total_tokens": 14281879 }, { "epoch": 0.5626406601650412, "grad_norm": 0.18528586626052856, "learning_rate": 0.00018661597664352284, "loss": 0.5666, "memory/device_reserved (GiB)": 139.06, "memory/max_active (GiB)": 25.53, "memory/max_allocated (GiB)": 25.53, "ppl": 1.7623, "step": 750, "tokens_per_second_per_gpu": 4229.32, "total_tokens": 14725919 }, { "epoch": 0.5813953488372093, "grad_norm": 0.16790929436683655, "learning_rate": 0.00018559096036182516, "loss": 0.5633, "memory/device_reserved (GiB)": 139.06, "memory/max_active (GiB)": 25.53, "memory/max_allocated (GiB)": 25.53, "ppl": 1.7565, "step": 775, "tokens_per_second_per_gpu": 3775.0, "total_tokens": 15175146 }, { "epoch": 0.6001500375093773, "grad_norm": 0.17511805891990662, "learning_rate": 0.00018453116203951005, "loss": 0.5664, "memory/device_reserved (GiB)": 139.06, "memory/max_active (GiB)": 25.53, "memory/max_allocated (GiB)": 25.53, "ppl": 1.7619, "step": 800, "tokens_per_second_per_gpu": 4218.07, "total_tokens": 15619901 }, { "epoch": 0.6189047261815454, "grad_norm": 0.19853387773036957, "learning_rate": 0.0001834370123522954, "loss": 0.5646, "memory/device_reserved (GiB)": 139.06, "memory/max_active (GiB)": 25.53, "memory/max_allocated (GiB)": 25.53, "ppl": 1.7587, "step": 825, "tokens_per_second_per_gpu": 4230.84, "total_tokens": 16066102 }, { "epoch": 0.6376594148537135, "grad_norm": 0.18872258067131042, "learning_rate": 0.00018230895593544056, "loss": 0.552, "memory/device_reserved (GiB)": 139.06, "memory/max_active (GiB)": 25.53, "memory/max_allocated (GiB)": 25.53, "ppl": 1.7367, "step": 850, "tokens_per_second_per_gpu": 4222.33, "total_tokens": 16510696 }, { "epoch": 0.6564141035258815, "grad_norm": 0.9702818989753723, "learning_rate": 0.0001811474512030578, "loss": 0.5607, "memory/device_reserved (GiB)": 139.06, "memory/max_active (GiB)": 25.53, "memory/max_allocated (GiB)": 25.53, "ppl": 1.7519, "step": 875, "tokens_per_second_per_gpu": 4200.39, "total_tokens": 16953918 }, { "epoch": 0.6751687921980495, "grad_norm": 0.17479568719863892, "learning_rate": 0.00017995297016182405, "loss": 0.564, "memory/device_reserved (GiB)": 139.06, "memory/max_active (GiB)": 25.53, "memory/max_allocated (GiB)": 25.53, "ppl": 1.7577, "step": 900, "tokens_per_second_per_gpu": 4210.15, "total_tokens": 17396453 }, { "epoch": 0.6939234808702176, "grad_norm": 0.1948954463005066, "learning_rate": 0.0001787259982191692, "loss": 0.5511, "memory/device_reserved (GiB)": 139.06, "memory/max_active (GiB)": 25.53, "memory/max_allocated (GiB)": 25.53, "ppl": 1.7352, "step": 925, "tokens_per_second_per_gpu": 4237.98, "total_tokens": 17841287 }, { "epoch": 0.7126781695423856, "grad_norm": 0.19541053473949432, "learning_rate": 0.00017746703398601872, "loss": 0.5532, "memory/device_reserved (GiB)": 139.06, "memory/max_active (GiB)": 25.53, "memory/max_allocated (GiB)": 25.53, "ppl": 1.7388, "step": 950, "tokens_per_second_per_gpu": 3725.33, "total_tokens": 18283596 }, { "epoch": 0.7314328582145536, "grad_norm": 0.1818365603685379, "learning_rate": 0.0001761765890741701, "loss": 0.5521, "memory/device_reserved (GiB)": 139.06, "memory/max_active (GiB)": 25.53, "memory/max_allocated (GiB)": 25.53, "ppl": 1.7369, "step": 975, "tokens_per_second_per_gpu": 4211.63, "total_tokens": 18726722 }, { "epoch": 0.7501875468867217, "grad_norm": 0.1838025599718094, "learning_rate": 0.00017485518788838705, "loss": 0.5511, "memory/device_reserved (GiB)": 139.06, "memory/max_active (GiB)": 25.53, "memory/max_allocated (GiB)": 25.53, "ppl": 1.7352, "step": 1000, "tokens_per_second_per_gpu": 3962.4, "total_tokens": 19167258 }, { "epoch": 0.7501875468867217, "eval_loss": 0.540988564491272, "eval_ppl": 1.7177, "eval_runtime": 138.0264, "eval_samples_per_second": 5.289, "eval_steps_per_second": 1.058, "memory/device_reserved (GiB)": 139.02, "memory/max_active (GiB)": 19.1, "memory/max_allocated (GiB)": 19.1, "step": 1000 }, { "epoch": 0.7689422355588897, "grad_norm": 0.2199818342924118, "learning_rate": 0.00017350336741329413, "loss": 0.549, "memory/device_reserved (GiB)": 139.06, "memory/max_active (GiB)": 25.53, "memory/max_allocated (GiB)": 25.53, "ppl": 1.7315, "step": 1025, "tokens_per_second_per_gpu": 4129.73, "total_tokens": 20870820 }, { "epoch": 0.7876969242310577, "grad_norm": 0.19783177971839905, "learning_rate": 0.0001721216769951596, "loss": 0.5615, "memory/device_reserved (GiB)": 139.06, "memory/max_active (GiB)": 25.53, "memory/max_allocated (GiB)": 25.53, "ppl": 1.7533, "step": 1050, "tokens_per_second_per_gpu": 4243.63, "total_tokens": 21317982 }, { "epoch": 0.8064516129032258, "grad_norm": 0.1678430140018463, "learning_rate": 0.00017071067811865476, "loss": 0.5557, "memory/device_reserved (GiB)": 139.06, "memory/max_active (GiB)": 25.53, "memory/max_allocated (GiB)": 25.53, "ppl": 1.7432, "step": 1075, "tokens_per_second_per_gpu": 4092.04, "total_tokens": 21754087 }, { "epoch": 0.8252063015753939, "grad_norm": 0.16523879766464233, "learning_rate": 0.00016927094417868048, "loss": 0.556, "memory/device_reserved (GiB)": 139.06, "memory/max_active (GiB)": 25.53, "memory/max_allocated (GiB)": 25.53, "ppl": 1.7437, "step": 1100, "tokens_per_second_per_gpu": 4187.02, "total_tokens": 22198779 }, { "epoch": 0.8439609902475619, "grad_norm": 0.18177717924118042, "learning_rate": 0.00016780306024735382, "loss": 0.5468, "memory/device_reserved (GiB)": 139.06, "memory/max_active (GiB)": 25.53, "memory/max_allocated (GiB)": 25.53, "ppl": 1.7277, "step": 1125, "tokens_per_second_per_gpu": 4198.97, "total_tokens": 22639769 }, { "epoch": 0.8627156789197299, "grad_norm": 0.17299720644950867, "learning_rate": 0.0001663076228362492, "loss": 0.554, "memory/device_reserved (GiB)": 139.06, "memory/max_active (GiB)": 25.53, "memory/max_allocated (GiB)": 25.53, "ppl": 1.7402, "step": 1150, "tokens_per_second_per_gpu": 3762.13, "total_tokens": 23086742 }, { "epoch": 0.881470367591898, "grad_norm": 0.19112971425056458, "learning_rate": 0.00016478523965399085, "loss": 0.5434, "memory/device_reserved (GiB)": 139.06, "memory/max_active (GiB)": 25.53, "memory/max_allocated (GiB)": 25.53, "ppl": 1.7219, "step": 1175, "tokens_per_second_per_gpu": 4205.37, "total_tokens": 23528106 }, { "epoch": 0.900225056264066, "grad_norm": 0.17930163443088531, "learning_rate": 0.00016323652935929536, "loss": 0.5362, "memory/device_reserved (GiB)": 139.06, "memory/max_active (GiB)": 25.53, "memory/max_allocated (GiB)": 25.53, "ppl": 1.7095, "step": 1200, "tokens_per_second_per_gpu": 4228.83, "total_tokens": 23974427 }, { "epoch": 0.918979744936234, "grad_norm": 0.18718039989471436, "learning_rate": 0.00016166212130956382, "loss": 0.5533, "memory/device_reserved (GiB)": 139.06, "memory/max_active (GiB)": 25.53, "memory/max_allocated (GiB)": 25.53, "ppl": 1.739, "step": 1225, "tokens_per_second_per_gpu": 4211.64, "total_tokens": 24415919 }, { "epoch": 0.9377344336084021, "grad_norm": 0.17105573415756226, "learning_rate": 0.0001600626553051268, "loss": 0.5492, "memory/device_reserved (GiB)": 139.06, "memory/max_active (GiB)": 25.53, "memory/max_allocated (GiB)": 25.53, "ppl": 1.7319, "step": 1250, "tokens_per_second_per_gpu": 4183.86, "total_tokens": 24854345 }, { "epoch": 0.9564891222805701, "grad_norm": 0.1733955442905426, "learning_rate": 0.0001584387813292454, "loss": 0.5348, "memory/device_reserved (GiB)": 139.06, "memory/max_active (GiB)": 25.53, "memory/max_allocated (GiB)": 25.53, "ppl": 1.7071, "step": 1275, "tokens_per_second_per_gpu": 4172.93, "total_tokens": 25292647 }, { "epoch": 0.9752438109527382, "grad_norm": 0.1858205944299698, "learning_rate": 0.00015679115928397401, "loss": 0.5527, "memory/device_reserved (GiB)": 139.06, "memory/max_active (GiB)": 25.53, "memory/max_allocated (GiB)": 25.53, "ppl": 1.7379, "step": 1300, "tokens_per_second_per_gpu": 4226.34, "total_tokens": 25733591 }, { "epoch": 0.9939984996249063, "grad_norm": 0.1944192498922348, "learning_rate": 0.00015512045872199276, "loss": 0.5311, "memory/device_reserved (GiB)": 139.06, "memory/max_active (GiB)": 25.53, "memory/max_allocated (GiB)": 25.53, "ppl": 1.7008, "step": 1325, "tokens_per_second_per_gpu": 3655.12, "total_tokens": 26164528 }, { "epoch": 1.0127531882970742, "grad_norm": 0.18358173966407776, "learning_rate": 0.00015342735857451777, "loss": 0.5145, "memory/device_reserved (GiB)": 139.06, "memory/max_active (GiB)": 25.53, "memory/max_allocated (GiB)": 25.53, "ppl": 1.6728, "step": 1350, "tokens_per_second_per_gpu": 4227.25, "total_tokens": 26610460 }, { "epoch": 1.0315078769692423, "grad_norm": 0.1853465735912323, "learning_rate": 0.00015171254687540038, "loss": 0.5081, "memory/device_reserved (GiB)": 139.06, "memory/max_active (GiB)": 25.53, "memory/max_allocated (GiB)": 25.53, "ppl": 1.6621, "step": 1375, "tokens_per_second_per_gpu": 4318.88, "total_tokens": 27064008 }, { "epoch": 1.0502625656414104, "grad_norm": 0.18925060331821442, "learning_rate": 0.0001499767204815273, "loss": 0.5185, "memory/device_reserved (GiB)": 139.06, "memory/max_active (GiB)": 25.53, "memory/max_allocated (GiB)": 25.53, "ppl": 1.6795, "step": 1400, "tokens_per_second_per_gpu": 4324.01, "total_tokens": 27516590 }, { "epoch": 1.0690172543135783, "grad_norm": 0.20961470901966095, "learning_rate": 0.00014822058478963532, "loss": 0.5234, "memory/device_reserved (GiB)": 139.06, "memory/max_active (GiB)": 25.53, "memory/max_allocated (GiB)": 25.53, "ppl": 1.6878, "step": 1425, "tokens_per_second_per_gpu": 4319.64, "total_tokens": 27970075 }, { "epoch": 1.0877719429857464, "grad_norm": 0.1982697695493698, "learning_rate": 0.0001464448534496555, "loss": 0.5169, "memory/device_reserved (GiB)": 139.06, "memory/max_active (GiB)": 25.53, "memory/max_allocated (GiB)": 25.53, "ppl": 1.6768, "step": 1450, "tokens_per_second_per_gpu": 4267.88, "total_tokens": 28419716 }, { "epoch": 1.1065266316579145, "grad_norm": 0.1925143301486969, "learning_rate": 0.00014465024807470376, "loss": 0.5197, "memory/device_reserved (GiB)": 139.06, "memory/max_active (GiB)": 25.53, "memory/max_allocated (GiB)": 25.53, "ppl": 1.6815, "step": 1475, "tokens_per_second_per_gpu": 4264.53, "total_tokens": 28866312 }, { "epoch": 1.1252813203300824, "grad_norm": 0.18788637220859528, "learning_rate": 0.0001428374979478349, "loss": 0.5204, "memory/device_reserved (GiB)": 139.06, "memory/max_active (GiB)": 25.53, "memory/max_allocated (GiB)": 25.53, "ppl": 1.6827, "step": 1500, "tokens_per_second_per_gpu": 3779.33, "total_tokens": 29315968 }, { "epoch": 1.1440360090022506, "grad_norm": 0.18954145908355713, "learning_rate": 0.00014100733972568038, "loss": 0.5164, "memory/device_reserved (GiB)": 139.06, "memory/max_active (GiB)": 25.53, "memory/max_allocated (GiB)": 25.53, "ppl": 1.676, "step": 1525, "tokens_per_second_per_gpu": 4282.57, "total_tokens": 29766723 }, { "epoch": 1.1627906976744187, "grad_norm": 0.19003146886825562, "learning_rate": 0.00013916051713908924, "loss": 0.5095, "memory/device_reserved (GiB)": 139.06, "memory/max_active (GiB)": 25.53, "memory/max_allocated (GiB)": 25.53, "ppl": 1.6645, "step": 1550, "tokens_per_second_per_gpu": 4290.76, "total_tokens": 30218573 }, { "epoch": 1.1815453863465866, "grad_norm": 0.18279583752155304, "learning_rate": 0.00013729778069089437, "loss": 0.522, "memory/device_reserved (GiB)": 139.06, "memory/max_active (GiB)": 25.53, "memory/max_allocated (GiB)": 25.53, "ppl": 1.6854, "step": 1575, "tokens_per_second_per_gpu": 4300.13, "total_tokens": 30669810 }, { "epoch": 1.2003000750187547, "grad_norm": 0.18783092498779297, "learning_rate": 0.00013541988735092672, "loss": 0.5003, "memory/device_reserved (GiB)": 139.06, "memory/max_active (GiB)": 25.53, "memory/max_allocated (GiB)": 25.53, "ppl": 1.6492, "step": 1600, "tokens_per_second_per_gpu": 4271.27, "total_tokens": 31117586 }, { "epoch": 1.2190547636909228, "grad_norm": 0.199558824300766, "learning_rate": 0.00013352760024840175, "loss": 0.5115, "memory/device_reserved (GiB)": 139.06, "memory/max_active (GiB)": 25.53, "memory/max_allocated (GiB)": 25.53, "ppl": 1.6678, "step": 1625, "tokens_per_second_per_gpu": 4248.14, "total_tokens": 31562224 }, { "epoch": 1.2378094523630907, "grad_norm": 0.19465653598308563, "learning_rate": 0.00013162168836180246, "loss": 0.4967, "memory/device_reserved (GiB)": 139.06, "memory/max_active (GiB)": 25.53, "memory/max_allocated (GiB)": 25.53, "ppl": 1.6433, "step": 1650, "tokens_per_second_per_gpu": 4286.24, "total_tokens": 32011071 }, { "epoch": 1.2565641410352588, "grad_norm": 0.2054641842842102, "learning_rate": 0.00012970292620638574, "loss": 0.5172, "memory/device_reserved (GiB)": 139.06, "memory/max_active (GiB)": 25.53, "memory/max_allocated (GiB)": 25.53, "ppl": 1.6773, "step": 1675, "tokens_per_second_per_gpu": 3733.1, "total_tokens": 32452490 }, { "epoch": 1.275318829707427, "grad_norm": 0.19450411200523376, "learning_rate": 0.00012777209351943862, "loss": 0.5149, "memory/device_reserved (GiB)": 139.06, "memory/max_active (GiB)": 25.53, "memory/max_allocated (GiB)": 25.53, "ppl": 1.6735, "step": 1700, "tokens_per_second_per_gpu": 4251.33, "total_tokens": 32899103 }, { "epoch": 1.2940735183795948, "grad_norm": 0.19844166934490204, "learning_rate": 0.0001258299749434123, "loss": 0.5205, "memory/device_reserved (GiB)": 139.06, "memory/max_active (GiB)": 25.53, "memory/max_allocated (GiB)": 25.53, "ppl": 1.6829, "step": 1725, "tokens_per_second_per_gpu": 4240.57, "total_tokens": 33344569 }, { "epoch": 1.312828207051763, "grad_norm": 0.19240470230579376, "learning_rate": 0.00012387735970706312, "loss": 0.5033, "memory/device_reserved (GiB)": 139.06, "memory/max_active (GiB)": 25.53, "memory/max_allocated (GiB)": 25.53, "ppl": 1.6542, "step": 1750, "tokens_per_second_per_gpu": 4267.65, "total_tokens": 33790426 }, { "epoch": 1.331582895723931, "grad_norm": 0.18220192193984985, "learning_rate": 0.00012191504130472937, "loss": 0.5103, "memory/device_reserved (GiB)": 139.06, "memory/max_active (GiB)": 25.53, "memory/max_allocated (GiB)": 25.53, "ppl": 1.6658, "step": 1775, "tokens_per_second_per_gpu": 4237.08, "total_tokens": 34233908 }, { "epoch": 1.350337584396099, "grad_norm": 0.20157551765441895, "learning_rate": 0.00011994381717387514, "loss": 0.5192, "memory/device_reserved (GiB)": 139.06, "memory/max_active (GiB)": 25.53, "memory/max_allocated (GiB)": 25.53, "ppl": 1.6807, "step": 1800, "tokens_per_second_per_gpu": 4244.09, "total_tokens": 34678691 }, { "epoch": 1.369092273068267, "grad_norm": 0.17189238965511322, "learning_rate": 0.00011796448837103129, "loss": 0.5011, "memory/device_reserved (GiB)": 139.06, "memory/max_active (GiB)": 25.53, "memory/max_allocated (GiB)": 25.53, "ppl": 1.6505, "step": 1825, "tokens_per_second_per_gpu": 4277.26, "total_tokens": 35125624 }, { "epoch": 1.387846961740435, "grad_norm": 0.19443106651306152, "learning_rate": 0.00011597785924626616, "loss": 0.4994, "memory/device_reserved (GiB)": 139.06, "memory/max_active (GiB)": 25.53, "memory/max_allocated (GiB)": 25.53, "ppl": 1.6477, "step": 1850, "tokens_per_second_per_gpu": 3766.52, "total_tokens": 35568850 }, { "epoch": 1.406601650412603, "grad_norm": 0.1810811311006546, "learning_rate": 0.00011398473711631764, "loss": 0.5083, "memory/device_reserved (GiB)": 139.06, "memory/max_active (GiB)": 25.53, "memory/max_allocated (GiB)": 25.53, "ppl": 1.6625, "step": 1875, "tokens_per_second_per_gpu": 4204.76, "total_tokens": 36009980 }, { "epoch": 1.4253563390847712, "grad_norm": 0.19805970788002014, "learning_rate": 0.00011198593193651958, "loss": 0.5141, "memory/device_reserved (GiB)": 139.06, "memory/max_active (GiB)": 25.53, "memory/max_allocated (GiB)": 25.53, "ppl": 1.6721, "step": 1900, "tokens_per_second_per_gpu": 4270.21, "total_tokens": 36457032 }, { "epoch": 1.4441110277569393, "grad_norm": 0.1936168372631073, "learning_rate": 0.00010998225597165628, "loss": 0.5045, "memory/device_reserved (GiB)": 139.06, "memory/max_active (GiB)": 25.53, "memory/max_allocated (GiB)": 25.53, "ppl": 1.6562, "step": 1925, "tokens_per_second_per_gpu": 4275.24, "total_tokens": 36905590 }, { "epoch": 1.4628657164291072, "grad_norm": 0.19065748155117035, "learning_rate": 0.00010797452346587798, "loss": 0.5025, "memory/device_reserved (GiB)": 139.06, "memory/max_active (GiB)": 25.53, "memory/max_allocated (GiB)": 25.53, "ppl": 1.6528, "step": 1950, "tokens_per_second_per_gpu": 4285.81, "total_tokens": 37354436 }, { "epoch": 1.4816204051012754, "grad_norm": 0.18647657334804535, "learning_rate": 0.0001059635503118125, "loss": 0.5102, "memory/device_reserved (GiB)": 139.06, "memory/max_active (GiB)": 25.53, "memory/max_allocated (GiB)": 25.53, "ppl": 1.6656, "step": 1975, "tokens_per_second_per_gpu": 4259.76, "total_tokens": 37801500 }, { "epoch": 1.5003750937734432, "grad_norm": 0.21211788058280945, "learning_rate": 0.00010395015371900663, "loss": 0.5052, "memory/device_reserved (GiB)": 139.06, "memory/max_active (GiB)": 25.53, "memory/max_allocated (GiB)": 25.53, "ppl": 1.6573, "step": 2000, "tokens_per_second_per_gpu": 4250.7, "total_tokens": 38244936 }, { "epoch": 1.5003750937734432, "eval_loss": 0.5063687562942505, "eval_ppl": 1.6593, "eval_runtime": 141.112, "eval_samples_per_second": 5.173, "eval_steps_per_second": 1.035, "memory/device_reserved (GiB)": 139.06, "memory/max_active (GiB)": 19.1, "memory/max_allocated (GiB)": 19.1, "step": 2000 } ], "logging_steps": 25, "max_steps": 3996, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.62796004179968e+18, "train_batch_size": 5, "trial_name": null, "trial_params": null }