| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 1.5003750937734432, | |
| "eval_steps": 1000, | |
| "global_step": 2000, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0, | |
| "eval_loss": 1.6887853145599365, | |
| "eval_ppl": 5.4129, | |
| "eval_runtime": 167.3526, | |
| "eval_samples_per_second": 4.362, | |
| "eval_steps_per_second": 0.872, | |
| "memory/device_reserved (GiB)": 139.12, | |
| "memory/max_active (GiB)": 18.94, | |
| "memory/max_allocated (GiB)": 18.94, | |
| "step": 0 | |
| }, | |
| { | |
| "epoch": 0.018754688672168042, | |
| "grad_norm": 1.415561556816101, | |
| "learning_rate": 4.8e-05, | |
| "loss": 1.6848, | |
| "memory/device_reserved (GiB)": 139.11, | |
| "memory/max_active (GiB)": 25.53, | |
| "memory/max_allocated (GiB)": 25.53, | |
| "ppl": 5.3914, | |
| "step": 25, | |
| "tokens_per_second_per_gpu": 16277.76, | |
| "total_tokens": 1723633 | |
| }, | |
| { | |
| "epoch": 0.037509377344336084, | |
| "grad_norm": 0.33179354667663574, | |
| "learning_rate": 9.8e-05, | |
| "loss": 0.9839, | |
| "memory/device_reserved (GiB)": 139.06, | |
| "memory/max_active (GiB)": 25.53, | |
| "memory/max_allocated (GiB)": 25.53, | |
| "ppl": 2.6749, | |
| "step": 50, | |
| "tokens_per_second_per_gpu": 4303.21, | |
| "total_tokens": 2175386 | |
| }, | |
| { | |
| "epoch": 0.056264066016504126, | |
| "grad_norm": 0.17453454434871674, | |
| "learning_rate": 0.000148, | |
| "loss": 0.8002, | |
| "memory/device_reserved (GiB)": 139.06, | |
| "memory/max_active (GiB)": 25.53, | |
| "memory/max_allocated (GiB)": 25.53, | |
| "ppl": 2.226, | |
| "step": 75, | |
| "tokens_per_second_per_gpu": 3776.03, | |
| "total_tokens": 2623712 | |
| }, | |
| { | |
| "epoch": 0.07501875468867217, | |
| "grad_norm": 0.19318008422851562, | |
| "learning_rate": 0.00019800000000000002, | |
| "loss": 0.7218, | |
| "memory/device_reserved (GiB)": 139.06, | |
| "memory/max_active (GiB)": 25.53, | |
| "memory/max_allocated (GiB)": 25.53, | |
| "ppl": 2.0581, | |
| "step": 100, | |
| "tokens_per_second_per_gpu": 4252.49, | |
| "total_tokens": 3072519 | |
| }, | |
| { | |
| "epoch": 0.09377344336084022, | |
| "grad_norm": 0.18435686826705933, | |
| "learning_rate": 0.00019998127418269004, | |
| "loss": 0.6759, | |
| "memory/device_reserved (GiB)": 139.06, | |
| "memory/max_active (GiB)": 25.53, | |
| "memory/max_allocated (GiB)": 25.53, | |
| "ppl": 1.9658, | |
| "step": 125, | |
| "tokens_per_second_per_gpu": 4303.31, | |
| "total_tokens": 3523983 | |
| }, | |
| { | |
| "epoch": 0.11252813203300825, | |
| "grad_norm": 0.19870473444461823, | |
| "learning_rate": 0.00019992195096972548, | |
| "loss": 0.6703, | |
| "memory/device_reserved (GiB)": 139.06, | |
| "memory/max_active (GiB)": 25.53, | |
| "memory/max_allocated (GiB)": 25.53, | |
| "ppl": 1.9548, | |
| "step": 150, | |
| "tokens_per_second_per_gpu": 4260.86, | |
| "total_tokens": 3973452 | |
| }, | |
| { | |
| "epoch": 0.1312828207051763, | |
| "grad_norm": 0.20499658584594727, | |
| "learning_rate": 0.0001998220219574743, | |
| "loss": 0.6381, | |
| "memory/device_reserved (GiB)": 139.06, | |
| "memory/max_active (GiB)": 25.53, | |
| "memory/max_allocated (GiB)": 25.53, | |
| "ppl": 1.8929, | |
| "step": 175, | |
| "tokens_per_second_per_gpu": 4288.64, | |
| "total_tokens": 4423763 | |
| }, | |
| { | |
| "epoch": 0.15003750937734434, | |
| "grad_norm": 0.18934418261051178, | |
| "learning_rate": 0.00019968152775460537, | |
| "loss": 0.6383, | |
| "memory/device_reserved (GiB)": 139.06, | |
| "memory/max_active (GiB)": 25.53, | |
| "memory/max_allocated (GiB)": 25.53, | |
| "ppl": 1.8933, | |
| "step": 200, | |
| "tokens_per_second_per_gpu": 4244.79, | |
| "total_tokens": 4872365 | |
| }, | |
| { | |
| "epoch": 0.16879219804951237, | |
| "grad_norm": 0.1827855408191681, | |
| "learning_rate": 0.00019950052545447352, | |
| "loss": 0.6347, | |
| "memory/device_reserved (GiB)": 139.06, | |
| "memory/max_active (GiB)": 25.53, | |
| "memory/max_allocated (GiB)": 25.53, | |
| "ppl": 1.8865, | |
| "step": 225, | |
| "tokens_per_second_per_gpu": 4252.71, | |
| "total_tokens": 5319322 | |
| }, | |
| { | |
| "epoch": 0.18754688672168043, | |
| "grad_norm": 0.16483066976070404, | |
| "learning_rate": 0.00019927908861191827, | |
| "loss": 0.6392, | |
| "memory/device_reserved (GiB)": 139.06, | |
| "memory/max_active (GiB)": 25.53, | |
| "memory/max_allocated (GiB)": 25.53, | |
| "ppl": 1.895, | |
| "step": 250, | |
| "tokens_per_second_per_gpu": 3772.0, | |
| "total_tokens": 5768644 | |
| }, | |
| { | |
| "epoch": 0.20630157539384847, | |
| "grad_norm": 0.17186357080936432, | |
| "learning_rate": 0.00019901730721337302, | |
| "loss": 0.614, | |
| "memory/device_reserved (GiB)": 139.06, | |
| "memory/max_active (GiB)": 25.53, | |
| "memory/max_allocated (GiB)": 25.53, | |
| "ppl": 1.8478, | |
| "step": 275, | |
| "tokens_per_second_per_gpu": 4281.82, | |
| "total_tokens": 6220751 | |
| }, | |
| { | |
| "epoch": 0.2250562640660165, | |
| "grad_norm": 0.18073013424873352, | |
| "learning_rate": 0.00019871528764029667, | |
| "loss": 0.6196, | |
| "memory/device_reserved (GiB)": 139.06, | |
| "memory/max_active (GiB)": 25.53, | |
| "memory/max_allocated (GiB)": 25.53, | |
| "ppl": 1.8582, | |
| "step": 300, | |
| "tokens_per_second_per_gpu": 4234.51, | |
| "total_tokens": 6668111 | |
| }, | |
| { | |
| "epoch": 0.24381095273818454, | |
| "grad_norm": 0.19639697670936584, | |
| "learning_rate": 0.00019837315262594306, | |
| "loss": 0.6181, | |
| "memory/device_reserved (GiB)": 139.06, | |
| "memory/max_active (GiB)": 25.53, | |
| "memory/max_allocated (GiB)": 25.53, | |
| "ppl": 1.8554, | |
| "step": 325, | |
| "tokens_per_second_per_gpu": 4261.44, | |
| "total_tokens": 7117439 | |
| }, | |
| { | |
| "epoch": 0.2625656414103526, | |
| "grad_norm": 0.1670486479997635, | |
| "learning_rate": 0.00019799104120548492, | |
| "loss": 0.6141, | |
| "memory/device_reserved (GiB)": 139.06, | |
| "memory/max_active (GiB)": 25.53, | |
| "memory/max_allocated (GiB)": 25.53, | |
| "ppl": 1.848, | |
| "step": 350, | |
| "tokens_per_second_per_gpu": 4298.97, | |
| "total_tokens": 7569060 | |
| }, | |
| { | |
| "epoch": 0.2813203300825206, | |
| "grad_norm": 0.17752495408058167, | |
| "learning_rate": 0.00019756910865951377, | |
| "loss": 0.6075, | |
| "memory/device_reserved (GiB)": 139.06, | |
| "memory/max_active (GiB)": 25.53, | |
| "memory/max_allocated (GiB)": 25.53, | |
| "ppl": 1.8358, | |
| "step": 375, | |
| "tokens_per_second_per_gpu": 4256.6, | |
| "total_tokens": 8017630 | |
| }, | |
| { | |
| "epoch": 0.30007501875468867, | |
| "grad_norm": 0.2000180035829544, | |
| "learning_rate": 0.00019710752645093747, | |
| "loss": 0.6108, | |
| "memory/device_reserved (GiB)": 139.06, | |
| "memory/max_active (GiB)": 25.53, | |
| "memory/max_allocated (GiB)": 25.53, | |
| "ppl": 1.8419, | |
| "step": 400, | |
| "tokens_per_second_per_gpu": 4245.46, | |
| "total_tokens": 8464998 | |
| }, | |
| { | |
| "epoch": 0.31882970742685673, | |
| "grad_norm": 0.17395919561386108, | |
| "learning_rate": 0.00019660648215530206, | |
| "loss": 0.5966, | |
| "memory/device_reserved (GiB)": 139.06, | |
| "memory/max_active (GiB)": 25.53, | |
| "memory/max_allocated (GiB)": 25.53, | |
| "ppl": 1.8159, | |
| "step": 425, | |
| "tokens_per_second_per_gpu": 3758.92, | |
| "total_tokens": 8914723 | |
| }, | |
| { | |
| "epoch": 0.33758439609902474, | |
| "grad_norm": 0.18785236775875092, | |
| "learning_rate": 0.00019606617938456572, | |
| "loss": 0.6099, | |
| "memory/device_reserved (GiB)": 139.06, | |
| "memory/max_active (GiB)": 25.53, | |
| "memory/max_allocated (GiB)": 25.53, | |
| "ppl": 1.8402, | |
| "step": 450, | |
| "tokens_per_second_per_gpu": 4200.53, | |
| "total_tokens": 9359638 | |
| }, | |
| { | |
| "epoch": 0.3563390847711928, | |
| "grad_norm": 0.17702797055244446, | |
| "learning_rate": 0.0001954868377043559, | |
| "loss": 0.5922, | |
| "memory/device_reserved (GiB)": 139.06, | |
| "memory/max_active (GiB)": 25.53, | |
| "memory/max_allocated (GiB)": 25.53, | |
| "ppl": 1.808, | |
| "step": 475, | |
| "tokens_per_second_per_gpu": 4265.36, | |
| "total_tokens": 9810837 | |
| }, | |
| { | |
| "epoch": 0.37509377344336087, | |
| "grad_norm": 0.19927558302879333, | |
| "learning_rate": 0.00019486869254474337, | |
| "loss": 0.5759, | |
| "memory/device_reserved (GiB)": 139.06, | |
| "memory/max_active (GiB)": 25.53, | |
| "memory/max_allocated (GiB)": 25.53, | |
| "ppl": 1.7787, | |
| "step": 500, | |
| "tokens_per_second_per_gpu": 4276.25, | |
| "total_tokens": 10261446 | |
| }, | |
| { | |
| "epoch": 0.3938484621155289, | |
| "grad_norm": 0.1908370852470398, | |
| "learning_rate": 0.0001942119951045692, | |
| "loss": 0.584, | |
| "memory/device_reserved (GiB)": 139.06, | |
| "memory/max_active (GiB)": 25.53, | |
| "memory/max_allocated (GiB)": 25.53, | |
| "ppl": 1.7932, | |
| "step": 525, | |
| "tokens_per_second_per_gpu": 4272.28, | |
| "total_tokens": 10707841 | |
| }, | |
| { | |
| "epoch": 0.41260315078769694, | |
| "grad_norm": 0.2064146101474762, | |
| "learning_rate": 0.00019351701224936383, | |
| "loss": 0.5791, | |
| "memory/device_reserved (GiB)": 139.06, | |
| "memory/max_active (GiB)": 25.53, | |
| "memory/max_allocated (GiB)": 25.53, | |
| "ppl": 1.7844, | |
| "step": 550, | |
| "tokens_per_second_per_gpu": 4250.37, | |
| "total_tokens": 11155384 | |
| }, | |
| { | |
| "epoch": 0.43135783945986494, | |
| "grad_norm": 0.26748332381248474, | |
| "learning_rate": 0.0001927840264028995, | |
| "loss": 0.5758, | |
| "memory/device_reserved (GiB)": 139.06, | |
| "memory/max_active (GiB)": 25.53, | |
| "memory/max_allocated (GiB)": 25.53, | |
| "ppl": 1.7786, | |
| "step": 575, | |
| "tokens_per_second_per_gpu": 4256.55, | |
| "total_tokens": 11601192 | |
| }, | |
| { | |
| "epoch": 0.450112528132033, | |
| "grad_norm": 0.17514832317829132, | |
| "learning_rate": 0.00019201333543242036, | |
| "loss": 0.5791, | |
| "memory/device_reserved (GiB)": 139.06, | |
| "memory/max_active (GiB)": 25.53, | |
| "memory/max_allocated (GiB)": 25.53, | |
| "ppl": 1.7844, | |
| "step": 600, | |
| "tokens_per_second_per_gpu": 3770.83, | |
| "total_tokens": 12048477 | |
| }, | |
| { | |
| "epoch": 0.46886721680420107, | |
| "grad_norm": 0.22069169580936432, | |
| "learning_rate": 0.00019120525252759647, | |
| "loss": 0.5803, | |
| "memory/device_reserved (GiB)": 139.06, | |
| "memory/max_active (GiB)": 25.53, | |
| "memory/max_allocated (GiB)": 25.53, | |
| "ppl": 1.7866, | |
| "step": 625, | |
| "tokens_per_second_per_gpu": 4179.31, | |
| "total_tokens": 12488141 | |
| }, | |
| { | |
| "epoch": 0.4876219054763691, | |
| "grad_norm": 0.20555566251277924, | |
| "learning_rate": 0.00019036010607325138, | |
| "loss": 0.5716, | |
| "memory/device_reserved (GiB)": 139.06, | |
| "memory/max_active (GiB)": 25.53, | |
| "memory/max_allocated (GiB)": 25.53, | |
| "ppl": 1.7711, | |
| "step": 650, | |
| "tokens_per_second_per_gpu": 4209.96, | |
| "total_tokens": 12934358 | |
| }, | |
| { | |
| "epoch": 0.5063765941485371, | |
| "grad_norm": 0.19018156826496124, | |
| "learning_rate": 0.00018947823951591478, | |
| "loss": 0.5608, | |
| "memory/device_reserved (GiB)": 139.06, | |
| "memory/max_active (GiB)": 25.53, | |
| "memory/max_allocated (GiB)": 25.53, | |
| "ppl": 1.7521, | |
| "step": 675, | |
| "tokens_per_second_per_gpu": 4226.4, | |
| "total_tokens": 13378983 | |
| }, | |
| { | |
| "epoch": 0.5251312828207052, | |
| "grad_norm": 0.17173859477043152, | |
| "learning_rate": 0.00018856001122425416, | |
| "loss": 0.5667, | |
| "memory/device_reserved (GiB)": 139.06, | |
| "memory/max_active (GiB)": 25.53, | |
| "memory/max_allocated (GiB)": 25.53, | |
| "ppl": 1.7624, | |
| "step": 700, | |
| "tokens_per_second_per_gpu": 4265.57, | |
| "total_tokens": 13829519 | |
| }, | |
| { | |
| "epoch": 0.5438859714928732, | |
| "grad_norm": 0.17706550657749176, | |
| "learning_rate": 0.0001876057943434428, | |
| "loss": 0.565, | |
| "memory/device_reserved (GiB)": 139.06, | |
| "memory/max_active (GiB)": 25.53, | |
| "memory/max_allocated (GiB)": 25.53, | |
| "ppl": 1.7594, | |
| "step": 725, | |
| "tokens_per_second_per_gpu": 4281.61, | |
| "total_tokens": 14281879 | |
| }, | |
| { | |
| "epoch": 0.5626406601650412, | |
| "grad_norm": 0.18528586626052856, | |
| "learning_rate": 0.00018661597664352284, | |
| "loss": 0.5666, | |
| "memory/device_reserved (GiB)": 139.06, | |
| "memory/max_active (GiB)": 25.53, | |
| "memory/max_allocated (GiB)": 25.53, | |
| "ppl": 1.7623, | |
| "step": 750, | |
| "tokens_per_second_per_gpu": 4229.32, | |
| "total_tokens": 14725919 | |
| }, | |
| { | |
| "epoch": 0.5813953488372093, | |
| "grad_norm": 0.16790929436683655, | |
| "learning_rate": 0.00018559096036182516, | |
| "loss": 0.5633, | |
| "memory/device_reserved (GiB)": 139.06, | |
| "memory/max_active (GiB)": 25.53, | |
| "memory/max_allocated (GiB)": 25.53, | |
| "ppl": 1.7565, | |
| "step": 775, | |
| "tokens_per_second_per_gpu": 3775.0, | |
| "total_tokens": 15175146 | |
| }, | |
| { | |
| "epoch": 0.6001500375093773, | |
| "grad_norm": 0.17511805891990662, | |
| "learning_rate": 0.00018453116203951005, | |
| "loss": 0.5664, | |
| "memory/device_reserved (GiB)": 139.06, | |
| "memory/max_active (GiB)": 25.53, | |
| "memory/max_allocated (GiB)": 25.53, | |
| "ppl": 1.7619, | |
| "step": 800, | |
| "tokens_per_second_per_gpu": 4218.07, | |
| "total_tokens": 15619901 | |
| }, | |
| { | |
| "epoch": 0.6189047261815454, | |
| "grad_norm": 0.19853387773036957, | |
| "learning_rate": 0.0001834370123522954, | |
| "loss": 0.5646, | |
| "memory/device_reserved (GiB)": 139.06, | |
| "memory/max_active (GiB)": 25.53, | |
| "memory/max_allocated (GiB)": 25.53, | |
| "ppl": 1.7587, | |
| "step": 825, | |
| "tokens_per_second_per_gpu": 4230.84, | |
| "total_tokens": 16066102 | |
| }, | |
| { | |
| "epoch": 0.6376594148537135, | |
| "grad_norm": 0.18872258067131042, | |
| "learning_rate": 0.00018230895593544056, | |
| "loss": 0.552, | |
| "memory/device_reserved (GiB)": 139.06, | |
| "memory/max_active (GiB)": 25.53, | |
| "memory/max_allocated (GiB)": 25.53, | |
| "ppl": 1.7367, | |
| "step": 850, | |
| "tokens_per_second_per_gpu": 4222.33, | |
| "total_tokens": 16510696 | |
| }, | |
| { | |
| "epoch": 0.6564141035258815, | |
| "grad_norm": 0.9702818989753723, | |
| "learning_rate": 0.0001811474512030578, | |
| "loss": 0.5607, | |
| "memory/device_reserved (GiB)": 139.06, | |
| "memory/max_active (GiB)": 25.53, | |
| "memory/max_allocated (GiB)": 25.53, | |
| "ppl": 1.7519, | |
| "step": 875, | |
| "tokens_per_second_per_gpu": 4200.39, | |
| "total_tokens": 16953918 | |
| }, | |
| { | |
| "epoch": 0.6751687921980495, | |
| "grad_norm": 0.17479568719863892, | |
| "learning_rate": 0.00017995297016182405, | |
| "loss": 0.564, | |
| "memory/device_reserved (GiB)": 139.06, | |
| "memory/max_active (GiB)": 25.53, | |
| "memory/max_allocated (GiB)": 25.53, | |
| "ppl": 1.7577, | |
| "step": 900, | |
| "tokens_per_second_per_gpu": 4210.15, | |
| "total_tokens": 17396453 | |
| }, | |
| { | |
| "epoch": 0.6939234808702176, | |
| "grad_norm": 0.1948954463005066, | |
| "learning_rate": 0.0001787259982191692, | |
| "loss": 0.5511, | |
| "memory/device_reserved (GiB)": 139.06, | |
| "memory/max_active (GiB)": 25.53, | |
| "memory/max_allocated (GiB)": 25.53, | |
| "ppl": 1.7352, | |
| "step": 925, | |
| "tokens_per_second_per_gpu": 4237.98, | |
| "total_tokens": 17841287 | |
| }, | |
| { | |
| "epoch": 0.7126781695423856, | |
| "grad_norm": 0.19541053473949432, | |
| "learning_rate": 0.00017746703398601872, | |
| "loss": 0.5532, | |
| "memory/device_reserved (GiB)": 139.06, | |
| "memory/max_active (GiB)": 25.53, | |
| "memory/max_allocated (GiB)": 25.53, | |
| "ppl": 1.7388, | |
| "step": 950, | |
| "tokens_per_second_per_gpu": 3725.33, | |
| "total_tokens": 18283596 | |
| }, | |
| { | |
| "epoch": 0.7314328582145536, | |
| "grad_norm": 0.1818365603685379, | |
| "learning_rate": 0.0001761765890741701, | |
| "loss": 0.5521, | |
| "memory/device_reserved (GiB)": 139.06, | |
| "memory/max_active (GiB)": 25.53, | |
| "memory/max_allocated (GiB)": 25.53, | |
| "ppl": 1.7369, | |
| "step": 975, | |
| "tokens_per_second_per_gpu": 4211.63, | |
| "total_tokens": 18726722 | |
| }, | |
| { | |
| "epoch": 0.7501875468867217, | |
| "grad_norm": 0.1838025599718094, | |
| "learning_rate": 0.00017485518788838705, | |
| "loss": 0.5511, | |
| "memory/device_reserved (GiB)": 139.06, | |
| "memory/max_active (GiB)": 25.53, | |
| "memory/max_allocated (GiB)": 25.53, | |
| "ppl": 1.7352, | |
| "step": 1000, | |
| "tokens_per_second_per_gpu": 3962.4, | |
| "total_tokens": 19167258 | |
| }, | |
| { | |
| "epoch": 0.7501875468867217, | |
| "eval_loss": 0.540988564491272, | |
| "eval_ppl": 1.7177, | |
| "eval_runtime": 138.0264, | |
| "eval_samples_per_second": 5.289, | |
| "eval_steps_per_second": 1.058, | |
| "memory/device_reserved (GiB)": 139.02, | |
| "memory/max_active (GiB)": 19.1, | |
| "memory/max_allocated (GiB)": 19.1, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.7689422355588897, | |
| "grad_norm": 0.2199818342924118, | |
| "learning_rate": 0.00017350336741329413, | |
| "loss": 0.549, | |
| "memory/device_reserved (GiB)": 139.06, | |
| "memory/max_active (GiB)": 25.53, | |
| "memory/max_allocated (GiB)": 25.53, | |
| "ppl": 1.7315, | |
| "step": 1025, | |
| "tokens_per_second_per_gpu": 4129.73, | |
| "total_tokens": 20870820 | |
| }, | |
| { | |
| "epoch": 0.7876969242310577, | |
| "grad_norm": 0.19783177971839905, | |
| "learning_rate": 0.0001721216769951596, | |
| "loss": 0.5615, | |
| "memory/device_reserved (GiB)": 139.06, | |
| "memory/max_active (GiB)": 25.53, | |
| "memory/max_allocated (GiB)": 25.53, | |
| "ppl": 1.7533, | |
| "step": 1050, | |
| "tokens_per_second_per_gpu": 4243.63, | |
| "total_tokens": 21317982 | |
| }, | |
| { | |
| "epoch": 0.8064516129032258, | |
| "grad_norm": 0.1678430140018463, | |
| "learning_rate": 0.00017071067811865476, | |
| "loss": 0.5557, | |
| "memory/device_reserved (GiB)": 139.06, | |
| "memory/max_active (GiB)": 25.53, | |
| "memory/max_allocated (GiB)": 25.53, | |
| "ppl": 1.7432, | |
| "step": 1075, | |
| "tokens_per_second_per_gpu": 4092.04, | |
| "total_tokens": 21754087 | |
| }, | |
| { | |
| "epoch": 0.8252063015753939, | |
| "grad_norm": 0.16523879766464233, | |
| "learning_rate": 0.00016927094417868048, | |
| "loss": 0.556, | |
| "memory/device_reserved (GiB)": 139.06, | |
| "memory/max_active (GiB)": 25.53, | |
| "memory/max_allocated (GiB)": 25.53, | |
| "ppl": 1.7437, | |
| "step": 1100, | |
| "tokens_per_second_per_gpu": 4187.02, | |
| "total_tokens": 22198779 | |
| }, | |
| { | |
| "epoch": 0.8439609902475619, | |
| "grad_norm": 0.18177717924118042, | |
| "learning_rate": 0.00016780306024735382, | |
| "loss": 0.5468, | |
| "memory/device_reserved (GiB)": 139.06, | |
| "memory/max_active (GiB)": 25.53, | |
| "memory/max_allocated (GiB)": 25.53, | |
| "ppl": 1.7277, | |
| "step": 1125, | |
| "tokens_per_second_per_gpu": 4198.97, | |
| "total_tokens": 22639769 | |
| }, | |
| { | |
| "epoch": 0.8627156789197299, | |
| "grad_norm": 0.17299720644950867, | |
| "learning_rate": 0.0001663076228362492, | |
| "loss": 0.554, | |
| "memory/device_reserved (GiB)": 139.06, | |
| "memory/max_active (GiB)": 25.53, | |
| "memory/max_allocated (GiB)": 25.53, | |
| "ppl": 1.7402, | |
| "step": 1150, | |
| "tokens_per_second_per_gpu": 3762.13, | |
| "total_tokens": 23086742 | |
| }, | |
| { | |
| "epoch": 0.881470367591898, | |
| "grad_norm": 0.19112971425056458, | |
| "learning_rate": 0.00016478523965399085, | |
| "loss": 0.5434, | |
| "memory/device_reserved (GiB)": 139.06, | |
| "memory/max_active (GiB)": 25.53, | |
| "memory/max_allocated (GiB)": 25.53, | |
| "ppl": 1.7219, | |
| "step": 1175, | |
| "tokens_per_second_per_gpu": 4205.37, | |
| "total_tokens": 23528106 | |
| }, | |
| { | |
| "epoch": 0.900225056264066, | |
| "grad_norm": 0.17930163443088531, | |
| "learning_rate": 0.00016323652935929536, | |
| "loss": 0.5362, | |
| "memory/device_reserved (GiB)": 139.06, | |
| "memory/max_active (GiB)": 25.53, | |
| "memory/max_allocated (GiB)": 25.53, | |
| "ppl": 1.7095, | |
| "step": 1200, | |
| "tokens_per_second_per_gpu": 4228.83, | |
| "total_tokens": 23974427 | |
| }, | |
| { | |
| "epoch": 0.918979744936234, | |
| "grad_norm": 0.18718039989471436, | |
| "learning_rate": 0.00016166212130956382, | |
| "loss": 0.5533, | |
| "memory/device_reserved (GiB)": 139.06, | |
| "memory/max_active (GiB)": 25.53, | |
| "memory/max_allocated (GiB)": 25.53, | |
| "ppl": 1.739, | |
| "step": 1225, | |
| "tokens_per_second_per_gpu": 4211.64, | |
| "total_tokens": 24415919 | |
| }, | |
| { | |
| "epoch": 0.9377344336084021, | |
| "grad_norm": 0.17105573415756226, | |
| "learning_rate": 0.0001600626553051268, | |
| "loss": 0.5492, | |
| "memory/device_reserved (GiB)": 139.06, | |
| "memory/max_active (GiB)": 25.53, | |
| "memory/max_allocated (GiB)": 25.53, | |
| "ppl": 1.7319, | |
| "step": 1250, | |
| "tokens_per_second_per_gpu": 4183.86, | |
| "total_tokens": 24854345 | |
| }, | |
| { | |
| "epoch": 0.9564891222805701, | |
| "grad_norm": 0.1733955442905426, | |
| "learning_rate": 0.0001584387813292454, | |
| "loss": 0.5348, | |
| "memory/device_reserved (GiB)": 139.06, | |
| "memory/max_active (GiB)": 25.53, | |
| "memory/max_allocated (GiB)": 25.53, | |
| "ppl": 1.7071, | |
| "step": 1275, | |
| "tokens_per_second_per_gpu": 4172.93, | |
| "total_tokens": 25292647 | |
| }, | |
| { | |
| "epoch": 0.9752438109527382, | |
| "grad_norm": 0.1858205944299698, | |
| "learning_rate": 0.00015679115928397401, | |
| "loss": 0.5527, | |
| "memory/device_reserved (GiB)": 139.06, | |
| "memory/max_active (GiB)": 25.53, | |
| "memory/max_allocated (GiB)": 25.53, | |
| "ppl": 1.7379, | |
| "step": 1300, | |
| "tokens_per_second_per_gpu": 4226.34, | |
| "total_tokens": 25733591 | |
| }, | |
| { | |
| "epoch": 0.9939984996249063, | |
| "grad_norm": 0.1944192498922348, | |
| "learning_rate": 0.00015512045872199276, | |
| "loss": 0.5311, | |
| "memory/device_reserved (GiB)": 139.06, | |
| "memory/max_active (GiB)": 25.53, | |
| "memory/max_allocated (GiB)": 25.53, | |
| "ppl": 1.7008, | |
| "step": 1325, | |
| "tokens_per_second_per_gpu": 3655.12, | |
| "total_tokens": 26164528 | |
| }, | |
| { | |
| "epoch": 1.0127531882970742, | |
| "grad_norm": 0.18358173966407776, | |
| "learning_rate": 0.00015342735857451777, | |
| "loss": 0.5145, | |
| "memory/device_reserved (GiB)": 139.06, | |
| "memory/max_active (GiB)": 25.53, | |
| "memory/max_allocated (GiB)": 25.53, | |
| "ppl": 1.6728, | |
| "step": 1350, | |
| "tokens_per_second_per_gpu": 4227.25, | |
| "total_tokens": 26610460 | |
| }, | |
| { | |
| "epoch": 1.0315078769692423, | |
| "grad_norm": 0.1853465735912323, | |
| "learning_rate": 0.00015171254687540038, | |
| "loss": 0.5081, | |
| "memory/device_reserved (GiB)": 139.06, | |
| "memory/max_active (GiB)": 25.53, | |
| "memory/max_allocated (GiB)": 25.53, | |
| "ppl": 1.6621, | |
| "step": 1375, | |
| "tokens_per_second_per_gpu": 4318.88, | |
| "total_tokens": 27064008 | |
| }, | |
| { | |
| "epoch": 1.0502625656414104, | |
| "grad_norm": 0.18925060331821442, | |
| "learning_rate": 0.0001499767204815273, | |
| "loss": 0.5185, | |
| "memory/device_reserved (GiB)": 139.06, | |
| "memory/max_active (GiB)": 25.53, | |
| "memory/max_allocated (GiB)": 25.53, | |
| "ppl": 1.6795, | |
| "step": 1400, | |
| "tokens_per_second_per_gpu": 4324.01, | |
| "total_tokens": 27516590 | |
| }, | |
| { | |
| "epoch": 1.0690172543135783, | |
| "grad_norm": 0.20961470901966095, | |
| "learning_rate": 0.00014822058478963532, | |
| "loss": 0.5234, | |
| "memory/device_reserved (GiB)": 139.06, | |
| "memory/max_active (GiB)": 25.53, | |
| "memory/max_allocated (GiB)": 25.53, | |
| "ppl": 1.6878, | |
| "step": 1425, | |
| "tokens_per_second_per_gpu": 4319.64, | |
| "total_tokens": 27970075 | |
| }, | |
| { | |
| "epoch": 1.0877719429857464, | |
| "grad_norm": 0.1982697695493698, | |
| "learning_rate": 0.0001464448534496555, | |
| "loss": 0.5169, | |
| "memory/device_reserved (GiB)": 139.06, | |
| "memory/max_active (GiB)": 25.53, | |
| "memory/max_allocated (GiB)": 25.53, | |
| "ppl": 1.6768, | |
| "step": 1450, | |
| "tokens_per_second_per_gpu": 4267.88, | |
| "total_tokens": 28419716 | |
| }, | |
| { | |
| "epoch": 1.1065266316579145, | |
| "grad_norm": 0.1925143301486969, | |
| "learning_rate": 0.00014465024807470376, | |
| "loss": 0.5197, | |
| "memory/device_reserved (GiB)": 139.06, | |
| "memory/max_active (GiB)": 25.53, | |
| "memory/max_allocated (GiB)": 25.53, | |
| "ppl": 1.6815, | |
| "step": 1475, | |
| "tokens_per_second_per_gpu": 4264.53, | |
| "total_tokens": 28866312 | |
| }, | |
| { | |
| "epoch": 1.1252813203300824, | |
| "grad_norm": 0.18788637220859528, | |
| "learning_rate": 0.0001428374979478349, | |
| "loss": 0.5204, | |
| "memory/device_reserved (GiB)": 139.06, | |
| "memory/max_active (GiB)": 25.53, | |
| "memory/max_allocated (GiB)": 25.53, | |
| "ppl": 1.6827, | |
| "step": 1500, | |
| "tokens_per_second_per_gpu": 3779.33, | |
| "total_tokens": 29315968 | |
| }, | |
| { | |
| "epoch": 1.1440360090022506, | |
| "grad_norm": 0.18954145908355713, | |
| "learning_rate": 0.00014100733972568038, | |
| "loss": 0.5164, | |
| "memory/device_reserved (GiB)": 139.06, | |
| "memory/max_active (GiB)": 25.53, | |
| "memory/max_allocated (GiB)": 25.53, | |
| "ppl": 1.676, | |
| "step": 1525, | |
| "tokens_per_second_per_gpu": 4282.57, | |
| "total_tokens": 29766723 | |
| }, | |
| { | |
| "epoch": 1.1627906976744187, | |
| "grad_norm": 0.19003146886825562, | |
| "learning_rate": 0.00013916051713908924, | |
| "loss": 0.5095, | |
| "memory/device_reserved (GiB)": 139.06, | |
| "memory/max_active (GiB)": 25.53, | |
| "memory/max_allocated (GiB)": 25.53, | |
| "ppl": 1.6645, | |
| "step": 1550, | |
| "tokens_per_second_per_gpu": 4290.76, | |
| "total_tokens": 30218573 | |
| }, | |
| { | |
| "epoch": 1.1815453863465866, | |
| "grad_norm": 0.18279583752155304, | |
| "learning_rate": 0.00013729778069089437, | |
| "loss": 0.522, | |
| "memory/device_reserved (GiB)": 139.06, | |
| "memory/max_active (GiB)": 25.53, | |
| "memory/max_allocated (GiB)": 25.53, | |
| "ppl": 1.6854, | |
| "step": 1575, | |
| "tokens_per_second_per_gpu": 4300.13, | |
| "total_tokens": 30669810 | |
| }, | |
| { | |
| "epoch": 1.2003000750187547, | |
| "grad_norm": 0.18783092498779297, | |
| "learning_rate": 0.00013541988735092672, | |
| "loss": 0.5003, | |
| "memory/device_reserved (GiB)": 139.06, | |
| "memory/max_active (GiB)": 25.53, | |
| "memory/max_allocated (GiB)": 25.53, | |
| "ppl": 1.6492, | |
| "step": 1600, | |
| "tokens_per_second_per_gpu": 4271.27, | |
| "total_tokens": 31117586 | |
| }, | |
| { | |
| "epoch": 1.2190547636909228, | |
| "grad_norm": 0.199558824300766, | |
| "learning_rate": 0.00013352760024840175, | |
| "loss": 0.5115, | |
| "memory/device_reserved (GiB)": 139.06, | |
| "memory/max_active (GiB)": 25.53, | |
| "memory/max_allocated (GiB)": 25.53, | |
| "ppl": 1.6678, | |
| "step": 1625, | |
| "tokens_per_second_per_gpu": 4248.14, | |
| "total_tokens": 31562224 | |
| }, | |
| { | |
| "epoch": 1.2378094523630907, | |
| "grad_norm": 0.19465653598308563, | |
| "learning_rate": 0.00013162168836180246, | |
| "loss": 0.4967, | |
| "memory/device_reserved (GiB)": 139.06, | |
| "memory/max_active (GiB)": 25.53, | |
| "memory/max_allocated (GiB)": 25.53, | |
| "ppl": 1.6433, | |
| "step": 1650, | |
| "tokens_per_second_per_gpu": 4286.24, | |
| "total_tokens": 32011071 | |
| }, | |
| { | |
| "epoch": 1.2565641410352588, | |
| "grad_norm": 0.2054641842842102, | |
| "learning_rate": 0.00012970292620638574, | |
| "loss": 0.5172, | |
| "memory/device_reserved (GiB)": 139.06, | |
| "memory/max_active (GiB)": 25.53, | |
| "memory/max_allocated (GiB)": 25.53, | |
| "ppl": 1.6773, | |
| "step": 1675, | |
| "tokens_per_second_per_gpu": 3733.1, | |
| "total_tokens": 32452490 | |
| }, | |
| { | |
| "epoch": 1.275318829707427, | |
| "grad_norm": 0.19450411200523376, | |
| "learning_rate": 0.00012777209351943862, | |
| "loss": 0.5149, | |
| "memory/device_reserved (GiB)": 139.06, | |
| "memory/max_active (GiB)": 25.53, | |
| "memory/max_allocated (GiB)": 25.53, | |
| "ppl": 1.6735, | |
| "step": 1700, | |
| "tokens_per_second_per_gpu": 4251.33, | |
| "total_tokens": 32899103 | |
| }, | |
| { | |
| "epoch": 1.2940735183795948, | |
| "grad_norm": 0.19844166934490204, | |
| "learning_rate": 0.0001258299749434123, | |
| "loss": 0.5205, | |
| "memory/device_reserved (GiB)": 139.06, | |
| "memory/max_active (GiB)": 25.53, | |
| "memory/max_allocated (GiB)": 25.53, | |
| "ppl": 1.6829, | |
| "step": 1725, | |
| "tokens_per_second_per_gpu": 4240.57, | |
| "total_tokens": 33344569 | |
| }, | |
| { | |
| "epoch": 1.312828207051763, | |
| "grad_norm": 0.19240470230579376, | |
| "learning_rate": 0.00012387735970706312, | |
| "loss": 0.5033, | |
| "memory/device_reserved (GiB)": 139.06, | |
| "memory/max_active (GiB)": 25.53, | |
| "memory/max_allocated (GiB)": 25.53, | |
| "ppl": 1.6542, | |
| "step": 1750, | |
| "tokens_per_second_per_gpu": 4267.65, | |
| "total_tokens": 33790426 | |
| }, | |
| { | |
| "epoch": 1.331582895723931, | |
| "grad_norm": 0.18220192193984985, | |
| "learning_rate": 0.00012191504130472937, | |
| "loss": 0.5103, | |
| "memory/device_reserved (GiB)": 139.06, | |
| "memory/max_active (GiB)": 25.53, | |
| "memory/max_allocated (GiB)": 25.53, | |
| "ppl": 1.6658, | |
| "step": 1775, | |
| "tokens_per_second_per_gpu": 4237.08, | |
| "total_tokens": 34233908 | |
| }, | |
| { | |
| "epoch": 1.350337584396099, | |
| "grad_norm": 0.20157551765441895, | |
| "learning_rate": 0.00011994381717387514, | |
| "loss": 0.5192, | |
| "memory/device_reserved (GiB)": 139.06, | |
| "memory/max_active (GiB)": 25.53, | |
| "memory/max_allocated (GiB)": 25.53, | |
| "ppl": 1.6807, | |
| "step": 1800, | |
| "tokens_per_second_per_gpu": 4244.09, | |
| "total_tokens": 34678691 | |
| }, | |
| { | |
| "epoch": 1.369092273068267, | |
| "grad_norm": 0.17189238965511322, | |
| "learning_rate": 0.00011796448837103129, | |
| "loss": 0.5011, | |
| "memory/device_reserved (GiB)": 139.06, | |
| "memory/max_active (GiB)": 25.53, | |
| "memory/max_allocated (GiB)": 25.53, | |
| "ppl": 1.6505, | |
| "step": 1825, | |
| "tokens_per_second_per_gpu": 4277.26, | |
| "total_tokens": 35125624 | |
| }, | |
| { | |
| "epoch": 1.387846961740435, | |
| "grad_norm": 0.19443106651306152, | |
| "learning_rate": 0.00011597785924626616, | |
| "loss": 0.4994, | |
| "memory/device_reserved (GiB)": 139.06, | |
| "memory/max_active (GiB)": 25.53, | |
| "memory/max_allocated (GiB)": 25.53, | |
| "ppl": 1.6477, | |
| "step": 1850, | |
| "tokens_per_second_per_gpu": 3766.52, | |
| "total_tokens": 35568850 | |
| }, | |
| { | |
| "epoch": 1.406601650412603, | |
| "grad_norm": 0.1810811311006546, | |
| "learning_rate": 0.00011398473711631764, | |
| "loss": 0.5083, | |
| "memory/device_reserved (GiB)": 139.06, | |
| "memory/max_active (GiB)": 25.53, | |
| "memory/max_allocated (GiB)": 25.53, | |
| "ppl": 1.6625, | |
| "step": 1875, | |
| "tokens_per_second_per_gpu": 4204.76, | |
| "total_tokens": 36009980 | |
| }, | |
| { | |
| "epoch": 1.4253563390847712, | |
| "grad_norm": 0.19805970788002014, | |
| "learning_rate": 0.00011198593193651958, | |
| "loss": 0.5141, | |
| "memory/device_reserved (GiB)": 139.06, | |
| "memory/max_active (GiB)": 25.53, | |
| "memory/max_allocated (GiB)": 25.53, | |
| "ppl": 1.6721, | |
| "step": 1900, | |
| "tokens_per_second_per_gpu": 4270.21, | |
| "total_tokens": 36457032 | |
| }, | |
| { | |
| "epoch": 1.4441110277569393, | |
| "grad_norm": 0.1936168372631073, | |
| "learning_rate": 0.00010998225597165628, | |
| "loss": 0.5045, | |
| "memory/device_reserved (GiB)": 139.06, | |
| "memory/max_active (GiB)": 25.53, | |
| "memory/max_allocated (GiB)": 25.53, | |
| "ppl": 1.6562, | |
| "step": 1925, | |
| "tokens_per_second_per_gpu": 4275.24, | |
| "total_tokens": 36905590 | |
| }, | |
| { | |
| "epoch": 1.4628657164291072, | |
| "grad_norm": 0.19065748155117035, | |
| "learning_rate": 0.00010797452346587798, | |
| "loss": 0.5025, | |
| "memory/device_reserved (GiB)": 139.06, | |
| "memory/max_active (GiB)": 25.53, | |
| "memory/max_allocated (GiB)": 25.53, | |
| "ppl": 1.6528, | |
| "step": 1950, | |
| "tokens_per_second_per_gpu": 4285.81, | |
| "total_tokens": 37354436 | |
| }, | |
| { | |
| "epoch": 1.4816204051012754, | |
| "grad_norm": 0.18647657334804535, | |
| "learning_rate": 0.0001059635503118125, | |
| "loss": 0.5102, | |
| "memory/device_reserved (GiB)": 139.06, | |
| "memory/max_active (GiB)": 25.53, | |
| "memory/max_allocated (GiB)": 25.53, | |
| "ppl": 1.6656, | |
| "step": 1975, | |
| "tokens_per_second_per_gpu": 4259.76, | |
| "total_tokens": 37801500 | |
| }, | |
| { | |
| "epoch": 1.5003750937734432, | |
| "grad_norm": 0.21211788058280945, | |
| "learning_rate": 0.00010395015371900663, | |
| "loss": 0.5052, | |
| "memory/device_reserved (GiB)": 139.06, | |
| "memory/max_active (GiB)": 25.53, | |
| "memory/max_allocated (GiB)": 25.53, | |
| "ppl": 1.6573, | |
| "step": 2000, | |
| "tokens_per_second_per_gpu": 4250.7, | |
| "total_tokens": 38244936 | |
| }, | |
| { | |
| "epoch": 1.5003750937734432, | |
| "eval_loss": 0.5063687562942505, | |
| "eval_ppl": 1.6593, | |
| "eval_runtime": 141.112, | |
| "eval_samples_per_second": 5.173, | |
| "eval_steps_per_second": 1.035, | |
| "memory/device_reserved (GiB)": 139.06, | |
| "memory/max_active (GiB)": 19.1, | |
| "memory/max_allocated (GiB)": 19.1, | |
| "step": 2000 | |
| } | |
| ], | |
| "logging_steps": 25, | |
| "max_steps": 3996, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 3, | |
| "save_steps": 1000, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 1.62796004179968e+18, | |
| "train_batch_size": 5, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |