| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 5.100806451612903, |
| "eval_steps": 500, |
| "global_step": 405, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.0020161290322580645, |
| "grad_norm": 14.229155540466309, |
| "learning_rate": 0.0002, |
| "loss": 2.857408046722412, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 17.41633, |
| "step": 1, |
| "tokens/total": 2048, |
| "tokens/train_per_sec_per_gpu": 20.25, |
| "tokens/trainable": 2039 |
| }, |
| { |
| "epoch": 0.004032258064516129, |
| "grad_norm": 0.4178522527217865, |
| "learning_rate": 0.00019999799412001546, |
| "loss": 2.866978645324707, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 17.58381, |
| "step": 2, |
| "tokens/total": 4096, |
| "tokens/train_per_sec_per_gpu": 262.4, |
| "tokens/trainable": 4078 |
| }, |
| { |
| "epoch": 0.006048387096774193, |
| "grad_norm": 0.42466941475868225, |
| "learning_rate": 0.00019999197656053288, |
| "loss": 2.8624894618988037, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 17.50505, |
| "step": 3, |
| "tokens/total": 6144, |
| "tokens/train_per_sec_per_gpu": 261.67, |
| "tokens/trainable": 6123 |
| }, |
| { |
| "epoch": 0.008064516129032258, |
| "grad_norm": 10.047090530395508, |
| "learning_rate": 0.0001999819475629623, |
| "loss": 2.7082412242889404, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 15.00287, |
| "step": 4, |
| "tokens/total": 8192, |
| "tokens/train_per_sec_per_gpu": 264.34, |
| "tokens/trainable": 8161 |
| }, |
| { |
| "epoch": 0.010080645161290322, |
| "grad_norm": 0.42684099078178406, |
| "learning_rate": 0.00019996790752964305, |
| "loss": 2.5221738815307617, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 12.45564, |
| "step": 5, |
| "tokens/total": 10240, |
| "tokens/train_per_sec_per_gpu": 257.89, |
| "tokens/trainable": 10195 |
| }, |
| { |
| "epoch": 0.012096774193548387, |
| "grad_norm": 0.552547812461853, |
| "learning_rate": 0.00019994985702382758, |
| "loss": 3.238849639892578, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 25.50437, |
| "step": 6, |
| "tokens/total": 12288, |
| "tokens/train_per_sec_per_gpu": 252.93, |
| "tokens/trainable": 12232 |
| }, |
| { |
| "epoch": 0.014112903225806451, |
| "grad_norm": 0.5256035327911377, |
| "learning_rate": 0.00019992779676965885, |
| "loss": 2.7185559272766113, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 15.15842, |
| "step": 7, |
| "tokens/total": 14336, |
| "tokens/train_per_sec_per_gpu": 263.26, |
| "tokens/trainable": 14266 |
| }, |
| { |
| "epoch": 0.016129032258064516, |
| "grad_norm": 0.5142767429351807, |
| "learning_rate": 0.00019990172765214128, |
| "loss": 2.322587728500366, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 10.20204, |
| "step": 8, |
| "tokens/total": 16384, |
| "tokens/train_per_sec_per_gpu": 256.27, |
| "tokens/trainable": 16312 |
| }, |
| { |
| "epoch": 0.018145161290322582, |
| "grad_norm": 0.6270139813423157, |
| "learning_rate": 0.00019987165071710527, |
| "loss": 2.8085548877716064, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 16.58593, |
| "step": 9, |
| "tokens/total": 18432, |
| "tokens/train_per_sec_per_gpu": 244.86, |
| "tokens/trainable": 18348 |
| }, |
| { |
| "epoch": 0.020161290322580645, |
| "grad_norm": 0.5775133371353149, |
| "learning_rate": 0.00019983756717116536, |
| "loss": 2.1894941329956055, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 8.93069, |
| "step": 10, |
| "tokens/total": 20480, |
| "tokens/train_per_sec_per_gpu": 255.48, |
| "tokens/trainable": 20385 |
| }, |
| { |
| "epoch": 0.02217741935483871, |
| "grad_norm": 0.7627199292182922, |
| "learning_rate": 0.0001997994783816715, |
| "loss": 2.6888070106506348, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 14.71411, |
| "step": 11, |
| "tokens/total": 22528, |
| "tokens/train_per_sec_per_gpu": 262.76, |
| "tokens/trainable": 22428 |
| }, |
| { |
| "epoch": 0.024193548387096774, |
| "grad_norm": 0.7487443685531616, |
| "learning_rate": 0.00019975738587665456, |
| "loss": 3.052485942840576, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 21.1679, |
| "step": 12, |
| "tokens/total": 24576, |
| "tokens/train_per_sec_per_gpu": 258.22, |
| "tokens/trainable": 24458 |
| }, |
| { |
| "epoch": 0.02620967741935484, |
| "grad_norm": 0.783439040184021, |
| "learning_rate": 0.00019971129134476473, |
| "loss": 2.723402500152588, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 15.23206, |
| "step": 13, |
| "tokens/total": 26624, |
| "tokens/train_per_sec_per_gpu": 260.81, |
| "tokens/trainable": 26498 |
| }, |
| { |
| "epoch": 0.028225806451612902, |
| "grad_norm": 0.8359043002128601, |
| "learning_rate": 0.00019966119663520412, |
| "loss": 2.5634570121765137, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 12.98061, |
| "step": 14, |
| "tokens/total": 28672, |
| "tokens/train_per_sec_per_gpu": 257.31, |
| "tokens/trainable": 28538 |
| }, |
| { |
| "epoch": 0.03024193548387097, |
| "grad_norm": 0.6595765352249146, |
| "learning_rate": 0.0001996071037576521, |
| "loss": 2.629350423812866, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 13.86476, |
| "step": 15, |
| "tokens/total": 30720, |
| "tokens/train_per_sec_per_gpu": 247.77, |
| "tokens/trainable": 30584 |
| }, |
| { |
| "epoch": 0.03225806451612903, |
| "grad_norm": 0.611452043056488, |
| "learning_rate": 0.00019954901488218515, |
| "loss": 2.4734597206115723, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 11.86342, |
| "step": 16, |
| "tokens/total": 32768, |
| "tokens/train_per_sec_per_gpu": 259.46, |
| "tokens/trainable": 32623 |
| }, |
| { |
| "epoch": 0.034274193548387094, |
| "grad_norm": 0.5893421769142151, |
| "learning_rate": 0.00019948693233918952, |
| "loss": 2.2999792098999023, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 9.97398, |
| "step": 17, |
| "tokens/total": 34816, |
| "tokens/train_per_sec_per_gpu": 256.75, |
| "tokens/trainable": 34652 |
| }, |
| { |
| "epoch": 0.036290322580645164, |
| "grad_norm": 0.6046351790428162, |
| "learning_rate": 0.0001994208586192678, |
| "loss": 2.5167291164398193, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 12.38801, |
| "step": 18, |
| "tokens/total": 36864, |
| "tokens/train_per_sec_per_gpu": 250.14, |
| "tokens/trainable": 36691 |
| }, |
| { |
| "epoch": 0.038306451612903226, |
| "grad_norm": 0.6316596865653992, |
| "learning_rate": 0.00019935079637313906, |
| "loss": 2.5756912231445312, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 13.1404, |
| "step": 19, |
| "tokens/total": 38912, |
| "tokens/train_per_sec_per_gpu": 249.87, |
| "tokens/trainable": 38729 |
| }, |
| { |
| "epoch": 0.04032258064516129, |
| "grad_norm": 0.6076182126998901, |
| "learning_rate": 0.00019927674841153237, |
| "loss": 2.090085983276367, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 8.08561, |
| "step": 20, |
| "tokens/total": 40960, |
| "tokens/train_per_sec_per_gpu": 249.2, |
| "tokens/trainable": 40773 |
| }, |
| { |
| "epoch": 0.04233870967741935, |
| "grad_norm": 0.618816614151001, |
| "learning_rate": 0.0001991987177050743, |
| "loss": 2.6480720043182373, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 14.12678, |
| "step": 21, |
| "tokens/total": 43008, |
| "tokens/train_per_sec_per_gpu": 255.22, |
| "tokens/trainable": 42805 |
| }, |
| { |
| "epoch": 0.04435483870967742, |
| "grad_norm": 0.6039671301841736, |
| "learning_rate": 0.00019911670738416947, |
| "loss": 1.99757719039917, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 7.37118, |
| "step": 22, |
| "tokens/total": 45056, |
| "tokens/train_per_sec_per_gpu": 259.8, |
| "tokens/trainable": 44849 |
| }, |
| { |
| "epoch": 0.046370967741935484, |
| "grad_norm": 0.5967584252357483, |
| "learning_rate": 0.00019903072073887507, |
| "loss": 2.346921443939209, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 10.45334, |
| "step": 23, |
| "tokens/total": 47104, |
| "tokens/train_per_sec_per_gpu": 261.54, |
| "tokens/trainable": 46878 |
| }, |
| { |
| "epoch": 0.04838709677419355, |
| "grad_norm": 0.5980664491653442, |
| "learning_rate": 0.000198940761218769, |
| "loss": 2.123394012451172, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 8.35946, |
| "step": 24, |
| "tokens/total": 49152, |
| "tokens/train_per_sec_per_gpu": 261.87, |
| "tokens/trainable": 48920 |
| }, |
| { |
| "epoch": 0.05040322580645161, |
| "grad_norm": 0.612209677696228, |
| "learning_rate": 0.00019884683243281116, |
| "loss": 2.1959524154663086, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 8.98856, |
| "step": 25, |
| "tokens/total": 51200, |
| "tokens/train_per_sec_per_gpu": 254.83, |
| "tokens/trainable": 50960 |
| }, |
| { |
| "epoch": 0.05241935483870968, |
| "grad_norm": 0.6679723858833313, |
| "learning_rate": 0.00019874893814919906, |
| "loss": 2.667539119720459, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 14.40448, |
| "step": 26, |
| "tokens/total": 53248, |
| "tokens/train_per_sec_per_gpu": 245.17, |
| "tokens/trainable": 52995 |
| }, |
| { |
| "epoch": 0.05443548387096774, |
| "grad_norm": 0.6861585378646851, |
| "learning_rate": 0.00019864708229521636, |
| "loss": 1.9803462028503418, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 7.24525, |
| "step": 27, |
| "tokens/total": 55296, |
| "tokens/train_per_sec_per_gpu": 258.01, |
| "tokens/trainable": 55034 |
| }, |
| { |
| "epoch": 0.056451612903225805, |
| "grad_norm": 0.5645394921302795, |
| "learning_rate": 0.0001985412689570754, |
| "loss": 1.961925745010376, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 7.11301, |
| "step": 28, |
| "tokens/total": 57344, |
| "tokens/train_per_sec_per_gpu": 258.84, |
| "tokens/trainable": 57050 |
| }, |
| { |
| "epoch": 0.05846774193548387, |
| "grad_norm": 0.6146724224090576, |
| "learning_rate": 0.00019843150237975344, |
| "loss": 2.7785215377807617, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 16.09521, |
| "step": 29, |
| "tokens/total": 59392, |
| "tokens/train_per_sec_per_gpu": 262.26, |
| "tokens/trainable": 59084 |
| }, |
| { |
| "epoch": 0.06048387096774194, |
| "grad_norm": 0.5862687230110168, |
| "learning_rate": 0.00019831778696682194, |
| "loss": 1.864811897277832, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 6.45472, |
| "step": 30, |
| "tokens/total": 61440, |
| "tokens/train_per_sec_per_gpu": 262.16, |
| "tokens/trainable": 61121 |
| }, |
| { |
| "epoch": 0.0625, |
| "grad_norm": 0.6342179775238037, |
| "learning_rate": 0.00019820012728027044, |
| "loss": 2.477421283721924, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 11.91051, |
| "step": 31, |
| "tokens/total": 63488, |
| "tokens/train_per_sec_per_gpu": 264.65, |
| "tokens/trainable": 63166 |
| }, |
| { |
| "epoch": 0.06451612903225806, |
| "grad_norm": 0.5973448157310486, |
| "learning_rate": 0.00019807852804032305, |
| "loss": 2.2537577152252197, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 9.52346, |
| "step": 32, |
| "tokens/total": 65536, |
| "tokens/train_per_sec_per_gpu": 249.19, |
| "tokens/trainable": 65203 |
| }, |
| { |
| "epoch": 0.06653225806451613, |
| "grad_norm": 0.5755914449691772, |
| "learning_rate": 0.00019795299412524945, |
| "loss": 2.242089033126831, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 9.41297, |
| "step": 33, |
| "tokens/total": 67584, |
| "tokens/train_per_sec_per_gpu": 257.52, |
| "tokens/trainable": 67227 |
| }, |
| { |
| "epoch": 0.06854838709677419, |
| "grad_norm": 0.6142029166221619, |
| "learning_rate": 0.000197823530571169, |
| "loss": 2.0939087867736816, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 8.11658, |
| "step": 34, |
| "tokens/total": 69632, |
| "tokens/train_per_sec_per_gpu": 258.27, |
| "tokens/trainable": 69269 |
| }, |
| { |
| "epoch": 0.07056451612903226, |
| "grad_norm": 0.6227375268936157, |
| "learning_rate": 0.0001976901425718487, |
| "loss": 2.35734224319458, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 10.56284, |
| "step": 35, |
| "tokens/total": 71680, |
| "tokens/train_per_sec_per_gpu": 260.78, |
| "tokens/trainable": 71312 |
| }, |
| { |
| "epoch": 0.07258064516129033, |
| "grad_norm": 0.5991148948669434, |
| "learning_rate": 0.00019755283547849494, |
| "loss": 2.319620370864868, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 10.17181, |
| "step": 36, |
| "tokens/total": 73728, |
| "tokens/train_per_sec_per_gpu": 259.96, |
| "tokens/trainable": 73349 |
| }, |
| { |
| "epoch": 0.07459677419354839, |
| "grad_norm": 0.5688547492027283, |
| "learning_rate": 0.0001974116147995387, |
| "loss": 1.8059585094451904, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 6.0858, |
| "step": 37, |
| "tokens/total": 75776, |
| "tokens/train_per_sec_per_gpu": 262.3, |
| "tokens/trainable": 75395 |
| }, |
| { |
| "epoch": 0.07661290322580645, |
| "grad_norm": 0.7161643505096436, |
| "learning_rate": 0.00019726648620041468, |
| "loss": 2.224153518676758, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 9.24565, |
| "step": 38, |
| "tokens/total": 77824, |
| "tokens/train_per_sec_per_gpu": 255.46, |
| "tokens/trainable": 77427 |
| }, |
| { |
| "epoch": 0.07862903225806452, |
| "grad_norm": 0.6673325300216675, |
| "learning_rate": 0.0001971174555033339, |
| "loss": 2.3063974380493164, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 10.0382, |
| "step": 39, |
| "tokens/total": 79872, |
| "tokens/train_per_sec_per_gpu": 254.47, |
| "tokens/trainable": 79459 |
| }, |
| { |
| "epoch": 0.08064516129032258, |
| "grad_norm": 0.6383804678916931, |
| "learning_rate": 0.00019696452868705024, |
| "loss": 2.597661018371582, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 13.43228, |
| "step": 40, |
| "tokens/total": 81920, |
| "tokens/train_per_sec_per_gpu": 256.24, |
| "tokens/trainable": 81506 |
| }, |
| { |
| "epoch": 0.08266129032258064, |
| "grad_norm": 0.7061684727668762, |
| "learning_rate": 0.00019680771188662044, |
| "loss": 1.7441378831863403, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 5.72097, |
| "step": 41, |
| "tokens/total": 83968, |
| "tokens/train_per_sec_per_gpu": 242.74, |
| "tokens/trainable": 83536 |
| }, |
| { |
| "epoch": 0.0846774193548387, |
| "grad_norm": 0.7068732380867004, |
| "learning_rate": 0.0001966470113931582, |
| "loss": 2.949695587158203, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 19.10014, |
| "step": 42, |
| "tokens/total": 86016, |
| "tokens/train_per_sec_per_gpu": 252.81, |
| "tokens/trainable": 85581 |
| }, |
| { |
| "epoch": 0.08669354838709678, |
| "grad_norm": 0.5967429876327515, |
| "learning_rate": 0.00019648243365358146, |
| "loss": 2.2845401763916016, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 9.82117, |
| "step": 43, |
| "tokens/total": 88064, |
| "tokens/train_per_sec_per_gpu": 256.36, |
| "tokens/trainable": 87621 |
| }, |
| { |
| "epoch": 0.08870967741935484, |
| "grad_norm": 0.6582663059234619, |
| "learning_rate": 0.00019631398527035422, |
| "loss": 2.4249186515808105, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 11.30131, |
| "step": 44, |
| "tokens/total": 90112, |
| "tokens/train_per_sec_per_gpu": 262.03, |
| "tokens/trainable": 89666 |
| }, |
| { |
| "epoch": 0.0907258064516129, |
| "grad_norm": 0.6329452395439148, |
| "learning_rate": 0.00019614167300122126, |
| "loss": 2.4295296669006348, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 11.35354, |
| "step": 45, |
| "tokens/total": 92160, |
| "tokens/train_per_sec_per_gpu": 255.81, |
| "tokens/trainable": 91706 |
| }, |
| { |
| "epoch": 0.09274193548387097, |
| "grad_norm": 0.6765903830528259, |
| "learning_rate": 0.0001959655037589372, |
| "loss": 2.2950925827026367, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 9.92535, |
| "step": 46, |
| "tokens/total": 94208, |
| "tokens/train_per_sec_per_gpu": 32.48, |
| "tokens/trainable": 93749 |
| }, |
| { |
| "epoch": 0.09475806451612903, |
| "grad_norm": 0.795857310295105, |
| "learning_rate": 0.00019578548461098914, |
| "loss": 1.9531352519989014, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 7.05076, |
| "step": 47, |
| "tokens/total": 96256, |
| "tokens/train_per_sec_per_gpu": 134.2, |
| "tokens/trainable": 95772 |
| }, |
| { |
| "epoch": 0.0967741935483871, |
| "grad_norm": 0.5967952013015747, |
| "learning_rate": 0.00019560162277931325, |
| "loss": 1.7950658798217773, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 6.01987, |
| "step": 48, |
| "tokens/total": 98304, |
| "tokens/train_per_sec_per_gpu": 133.62, |
| "tokens/trainable": 97793 |
| }, |
| { |
| "epoch": 0.09879032258064516, |
| "grad_norm": 0.6623239517211914, |
| "learning_rate": 0.00019541392564000488, |
| "loss": 2.242034912109375, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 9.41247, |
| "step": 49, |
| "tokens/total": 100352, |
| "tokens/train_per_sec_per_gpu": 133.56, |
| "tokens/trainable": 99828 |
| }, |
| { |
| "epoch": 0.10080645161290322, |
| "grad_norm": 0.6411992907524109, |
| "learning_rate": 0.00019522240072302274, |
| "loss": 1.8567615747451782, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 6.40297, |
| "step": 50, |
| "tokens/total": 102400, |
| "tokens/train_per_sec_per_gpu": 136.64, |
| "tokens/trainable": 101859 |
| }, |
| { |
| "epoch": 0.1028225806451613, |
| "grad_norm": 0.6743437647819519, |
| "learning_rate": 0.00019502705571188672, |
| "loss": 1.7463788986206055, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 5.7338, |
| "step": 51, |
| "tokens/total": 104448, |
| "tokens/train_per_sec_per_gpu": 134.46, |
| "tokens/trainable": 103902 |
| }, |
| { |
| "epoch": 0.10483870967741936, |
| "grad_norm": 0.5422692894935608, |
| "learning_rate": 0.0001948278984433699, |
| "loss": 1.7116990089416504, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 5.53836, |
| "step": 52, |
| "tokens/total": 106496, |
| "tokens/train_per_sec_per_gpu": 135.55, |
| "tokens/trainable": 105941 |
| }, |
| { |
| "epoch": 0.10685483870967742, |
| "grad_norm": 0.5959545969963074, |
| "learning_rate": 0.0001946249369071837, |
| "loss": 2.0541088581085205, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 7.79988, |
| "step": 53, |
| "tokens/total": 108544, |
| "tokens/train_per_sec_per_gpu": 136.22, |
| "tokens/trainable": 107963 |
| }, |
| { |
| "epoch": 0.10887096774193548, |
| "grad_norm": 0.7033765316009521, |
| "learning_rate": 0.00019441817924565786, |
| "loss": 2.3317766189575195, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 10.29622, |
| "step": 54, |
| "tokens/total": 110592, |
| "tokens/train_per_sec_per_gpu": 138.09, |
| "tokens/trainable": 109990 |
| }, |
| { |
| "epoch": 0.11088709677419355, |
| "grad_norm": 0.8628013134002686, |
| "learning_rate": 0.0001942076337534135, |
| "loss": 2.250319004058838, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 9.49076, |
| "step": 55, |
| "tokens/total": 112640, |
| "tokens/train_per_sec_per_gpu": 133.73, |
| "tokens/trainable": 112020 |
| }, |
| { |
| "epoch": 0.11290322580645161, |
| "grad_norm": 0.6559049487113953, |
| "learning_rate": 0.00019399330887703037, |
| "loss": 2.2744903564453125, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 9.72296, |
| "step": 56, |
| "tokens/total": 114688, |
| "tokens/train_per_sec_per_gpu": 134.26, |
| "tokens/trainable": 114066 |
| }, |
| { |
| "epoch": 0.11491935483870967, |
| "grad_norm": 0.6887659430503845, |
| "learning_rate": 0.00019377521321470805, |
| "loss": 2.313232898712158, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 10.10705, |
| "step": 57, |
| "tokens/total": 116736, |
| "tokens/train_per_sec_per_gpu": 133.71, |
| "tokens/trainable": 116088 |
| }, |
| { |
| "epoch": 0.11693548387096774, |
| "grad_norm": 0.6893835663795471, |
| "learning_rate": 0.00019355335551592105, |
| "loss": 2.245192289352417, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 9.44223, |
| "step": 58, |
| "tokens/total": 118784, |
| "tokens/train_per_sec_per_gpu": 133.19, |
| "tokens/trainable": 118109 |
| }, |
| { |
| "epoch": 0.11895161290322581, |
| "grad_norm": 0.7315247654914856, |
| "learning_rate": 0.00019332774468106768, |
| "loss": 2.1427183151245117, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 8.52257, |
| "step": 59, |
| "tokens/total": 120832, |
| "tokens/train_per_sec_per_gpu": 136.05, |
| "tokens/trainable": 120155 |
| }, |
| { |
| "epoch": 0.12096774193548387, |
| "grad_norm": 0.6021593809127808, |
| "learning_rate": 0.00019309838976111311, |
| "loss": 2.017518997192383, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 7.51965, |
| "step": 60, |
| "tokens/total": 122880, |
| "tokens/train_per_sec_per_gpu": 134.7, |
| "tokens/trainable": 122178 |
| }, |
| { |
| "epoch": 0.12298387096774194, |
| "grad_norm": 0.6357892751693726, |
| "learning_rate": 0.00019286529995722623, |
| "loss": 1.8628515005111694, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 6.44208, |
| "step": 61, |
| "tokens/total": 124928, |
| "tokens/train_per_sec_per_gpu": 133.45, |
| "tokens/trainable": 124202 |
| }, |
| { |
| "epoch": 0.125, |
| "grad_norm": 0.6388751864433289, |
| "learning_rate": 0.00019262848462041045, |
| "loss": 1.9663957357406616, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 7.14488, |
| "step": 62, |
| "tokens/total": 126976, |
| "tokens/train_per_sec_per_gpu": 131.0, |
| "tokens/trainable": 126219 |
| }, |
| { |
| "epoch": 0.12701612903225806, |
| "grad_norm": 0.6599971652030945, |
| "learning_rate": 0.0001923879532511287, |
| "loss": 2.0729598999023438, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 7.94831, |
| "step": 63, |
| "tokens/total": 129024, |
| "tokens/train_per_sec_per_gpu": 135.7, |
| "tokens/trainable": 128262 |
| }, |
| { |
| "epoch": 0.12903225806451613, |
| "grad_norm": 0.7960649728775024, |
| "learning_rate": 0.0001921437154989221, |
| "loss": 2.5976061820983887, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 13.43155, |
| "step": 64, |
| "tokens/total": 131072, |
| "tokens/train_per_sec_per_gpu": 136.99, |
| "tokens/trainable": 130300 |
| }, |
| { |
| "epoch": 0.1310483870967742, |
| "grad_norm": 0.6315323710441589, |
| "learning_rate": 0.00019189578116202307, |
| "loss": 2.0962467193603516, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 8.13558, |
| "step": 65, |
| "tokens/total": 133120, |
| "tokens/train_per_sec_per_gpu": 137.36, |
| "tokens/trainable": 132335 |
| }, |
| { |
| "epoch": 0.13306451612903225, |
| "grad_norm": 0.6805335879325867, |
| "learning_rate": 0.00019164416018696207, |
| "loss": 2.0736522674560547, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 7.95382, |
| "step": 66, |
| "tokens/total": 135168, |
| "tokens/train_per_sec_per_gpu": 137.33, |
| "tokens/trainable": 134340 |
| }, |
| { |
| "epoch": 0.1350806451612903, |
| "grad_norm": 0.635079562664032, |
| "learning_rate": 0.00019138886266816866, |
| "loss": 2.0142641067504883, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 7.49521, |
| "step": 67, |
| "tokens/total": 137216, |
| "tokens/train_per_sec_per_gpu": 133.68, |
| "tokens/trainable": 136365 |
| }, |
| { |
| "epoch": 0.13709677419354838, |
| "grad_norm": 0.6177524924278259, |
| "learning_rate": 0.00019112989884756653, |
| "loss": 1.8478095531463623, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 6.3459, |
| "step": 68, |
| "tokens/total": 139264, |
| "tokens/train_per_sec_per_gpu": 136.24, |
| "tokens/trainable": 138391 |
| }, |
| { |
| "epoch": 0.13911290322580644, |
| "grad_norm": 0.6538437604904175, |
| "learning_rate": 0.0001908672791141625, |
| "loss": 2.0607097148895264, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 7.85154, |
| "step": 69, |
| "tokens/total": 141312, |
| "tokens/train_per_sec_per_gpu": 138.96, |
| "tokens/trainable": 140435 |
| }, |
| { |
| "epoch": 0.14112903225806453, |
| "grad_norm": 0.6102776527404785, |
| "learning_rate": 0.00019060101400362998, |
| "loss": 1.8468718528747559, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 6.33996, |
| "step": 70, |
| "tokens/total": 143360, |
| "tokens/train_per_sec_per_gpu": 137.72, |
| "tokens/trainable": 142463 |
| }, |
| { |
| "epoch": 0.1431451612903226, |
| "grad_norm": 0.6988463997840881, |
| "learning_rate": 0.00019033111419788597, |
| "loss": 2.087909460067749, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 8.06803, |
| "step": 71, |
| "tokens/total": 145408, |
| "tokens/train_per_sec_per_gpu": 120.7, |
| "tokens/trainable": 144290 |
| }, |
| { |
| "epoch": 1.002016129032258, |
| "grad_norm": 0.5984178781509399, |
| "learning_rate": 0.000190057590524663, |
| "loss": 1.9351210594177246, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 6.92488, |
| "step": 72, |
| "tokens/total": 147456, |
| "tokens/train_per_sec_per_gpu": 134.47, |
| "tokens/trainable": 146323 |
| }, |
| { |
| "epoch": 1.0040322580645162, |
| "grad_norm": 0.6225078701972961, |
| "learning_rate": 0.00018978045395707418, |
| "loss": 1.8930723667144775, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 6.63974, |
| "step": 73, |
| "tokens/total": 149504, |
| "tokens/train_per_sec_per_gpu": 138.22, |
| "tokens/trainable": 148360 |
| }, |
| { |
| "epoch": 1.0060483870967742, |
| "grad_norm": 0.6837508082389832, |
| "learning_rate": 0.0001894997156131734, |
| "loss": 1.9495553970336914, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 7.02556, |
| "step": 74, |
| "tokens/total": 151552, |
| "tokens/train_per_sec_per_gpu": 137.22, |
| "tokens/trainable": 150403 |
| }, |
| { |
| "epoch": 1.0080645161290323, |
| "grad_norm": 0.6232768893241882, |
| "learning_rate": 0.0001892153867555092, |
| "loss": 1.8917932510375977, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 6.63125, |
| "step": 75, |
| "tokens/total": 153600, |
| "tokens/train_per_sec_per_gpu": 137.44, |
| "tokens/trainable": 152441 |
| }, |
| { |
| "epoch": 1.0100806451612903, |
| "grad_norm": 0.664825975894928, |
| "learning_rate": 0.00018892747879067286, |
| "loss": 1.785915732383728, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 5.96504, |
| "step": 76, |
| "tokens/total": 155648, |
| "tokens/train_per_sec_per_gpu": 136.24, |
| "tokens/trainable": 154488 |
| }, |
| { |
| "epoch": 1.0120967741935485, |
| "grad_norm": 0.6600914597511292, |
| "learning_rate": 0.00018863600326884082, |
| "loss": 1.8432128429412842, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 6.3168, |
| "step": 77, |
| "tokens/total": 157696, |
| "tokens/train_per_sec_per_gpu": 133.58, |
| "tokens/trainable": 156510 |
| }, |
| { |
| "epoch": 1.0141129032258065, |
| "grad_norm": 0.7409534454345703, |
| "learning_rate": 0.00018834097188331143, |
| "loss": 2.134878158569336, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 8.45602, |
| "step": 78, |
| "tokens/total": 159744, |
| "tokens/train_per_sec_per_gpu": 136.39, |
| "tokens/trainable": 158551 |
| }, |
| { |
| "epoch": 1.0161290322580645, |
| "grad_norm": 0.6784939765930176, |
| "learning_rate": 0.00018804239647003573, |
| "loss": 1.950951099395752, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 7.03538, |
| "step": 79, |
| "tokens/total": 161792, |
| "tokens/train_per_sec_per_gpu": 133.83, |
| "tokens/trainable": 160585 |
| }, |
| { |
| "epoch": 1.0181451612903225, |
| "grad_norm": 0.6995214819908142, |
| "learning_rate": 0.00018774028900714256, |
| "loss": 2.291863441467285, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 9.89336, |
| "step": 80, |
| "tokens/total": 163840, |
| "tokens/train_per_sec_per_gpu": 135.69, |
| "tokens/trainable": 162622 |
| }, |
| { |
| "epoch": 1.0201612903225807, |
| "grad_norm": 0.6955534219741821, |
| "learning_rate": 0.00018743466161445823, |
| "loss": 1.7035590410232544, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 5.49346, |
| "step": 81, |
| "tokens/total": 165888, |
| "tokens/train_per_sec_per_gpu": 133.41, |
| "tokens/trainable": 164663 |
| }, |
| { |
| "epoch": 1.0221774193548387, |
| "grad_norm": 0.6910862326622009, |
| "learning_rate": 0.0001871255265530201, |
| "loss": 2.1356892585754395, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 8.46288, |
| "step": 82, |
| "tokens/total": 167936, |
| "tokens/train_per_sec_per_gpu": 133.66, |
| "tokens/trainable": 166700 |
| }, |
| { |
| "epoch": 1.0241935483870968, |
| "grad_norm": 0.7269375324249268, |
| "learning_rate": 0.00018681289622458485, |
| "loss": 2.05245304107666, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 7.78698, |
| "step": 83, |
| "tokens/total": 169984, |
| "tokens/train_per_sec_per_gpu": 133.27, |
| "tokens/trainable": 168743 |
| }, |
| { |
| "epoch": 1.0262096774193548, |
| "grad_norm": 0.7181906700134277, |
| "learning_rate": 0.00018649678317113084, |
| "loss": 1.815029263496399, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 6.14126, |
| "step": 84, |
| "tokens/total": 172032, |
| "tokens/train_per_sec_per_gpu": 133.71, |
| "tokens/trainable": 170779 |
| }, |
| { |
| "epoch": 1.028225806451613, |
| "grad_norm": 0.7987785935401917, |
| "learning_rate": 0.00018617720007435497, |
| "loss": 2.4782207012176514, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 11.92004, |
| "step": 85, |
| "tokens/total": 174080, |
| "tokens/train_per_sec_per_gpu": 134.46, |
| "tokens/trainable": 172825 |
| }, |
| { |
| "epoch": 1.030241935483871, |
| "grad_norm": 0.7619947195053101, |
| "learning_rate": 0.000185854159755164, |
| "loss": 2.177952289581299, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 8.82821, |
| "step": 86, |
| "tokens/total": 176128, |
| "tokens/train_per_sec_per_gpu": 131.89, |
| "tokens/trainable": 174861 |
| }, |
| { |
| "epoch": 1.032258064516129, |
| "grad_norm": 0.6998146772384644, |
| "learning_rate": 0.00018552767517316022, |
| "loss": 1.8050026893615723, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 6.07999, |
| "step": 87, |
| "tokens/total": 178176, |
| "tokens/train_per_sec_per_gpu": 133.17, |
| "tokens/trainable": 176894 |
| }, |
| { |
| "epoch": 1.034274193548387, |
| "grad_norm": 0.7544717788696289, |
| "learning_rate": 0.00018519775942612128, |
| "loss": 2.0845391750335693, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 8.04089, |
| "step": 88, |
| "tokens/total": 180224, |
| "tokens/train_per_sec_per_gpu": 132.51, |
| "tokens/trainable": 178924 |
| }, |
| { |
| "epoch": 1.0362903225806452, |
| "grad_norm": 0.7606767416000366, |
| "learning_rate": 0.00018486442574947511, |
| "loss": 2.1034138202667236, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 8.1941, |
| "step": 89, |
| "tokens/total": 182272, |
| "tokens/train_per_sec_per_gpu": 134.29, |
| "tokens/trainable": 180969 |
| }, |
| { |
| "epoch": 1.0383064516129032, |
| "grad_norm": 0.7183067798614502, |
| "learning_rate": 0.0001845276875157687, |
| "loss": 2.0029361248016357, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 7.41078, |
| "step": 90, |
| "tokens/total": 184320, |
| "tokens/train_per_sec_per_gpu": 132.23, |
| "tokens/trainable": 183002 |
| }, |
| { |
| "epoch": 1.0403225806451613, |
| "grad_norm": 0.8227747082710266, |
| "learning_rate": 0.0001841875582341317, |
| "loss": 2.2535457611083984, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 9.52144, |
| "step": 91, |
| "tokens/total": 186368, |
| "tokens/train_per_sec_per_gpu": 130.72, |
| "tokens/trainable": 185040 |
| }, |
| { |
| "epoch": 1.0423387096774193, |
| "grad_norm": 0.8300987482070923, |
| "learning_rate": 0.0001838440515497345, |
| "loss": 2.2482643127441406, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 9.47128, |
| "step": 92, |
| "tokens/total": 188416, |
| "tokens/train_per_sec_per_gpu": 133.43, |
| "tokens/trainable": 187068 |
| }, |
| { |
| "epoch": 1.0443548387096775, |
| "grad_norm": 0.7577045559883118, |
| "learning_rate": 0.00018349718124324076, |
| "loss": 2.021200656890869, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 7.54738, |
| "step": 93, |
| "tokens/total": 190464, |
| "tokens/train_per_sec_per_gpu": 133.41, |
| "tokens/trainable": 189096 |
| }, |
| { |
| "epoch": 1.0463709677419355, |
| "grad_norm": 0.7605635523796082, |
| "learning_rate": 0.00018314696123025454, |
| "loss": 1.8970659971237183, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 6.66631, |
| "step": 94, |
| "tokens/total": 192512, |
| "tokens/train_per_sec_per_gpu": 133.88, |
| "tokens/trainable": 191131 |
| }, |
| { |
| "epoch": 1.0483870967741935, |
| "grad_norm": 0.7875813245773315, |
| "learning_rate": 0.00018279340556076216, |
| "loss": 1.7688566446304321, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 5.86414, |
| "step": 95, |
| "tokens/total": 194560, |
| "tokens/train_per_sec_per_gpu": 131.53, |
| "tokens/trainable": 193158 |
| }, |
| { |
| "epoch": 1.0504032258064515, |
| "grad_norm": 0.7767308354377747, |
| "learning_rate": 0.0001824365284185684, |
| "loss": 1.7849911451339722, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 5.95953, |
| "step": 96, |
| "tokens/total": 196608, |
| "tokens/train_per_sec_per_gpu": 132.93, |
| "tokens/trainable": 195195 |
| }, |
| { |
| "epoch": 1.0524193548387097, |
| "grad_norm": 0.7135453224182129, |
| "learning_rate": 0.00018207634412072764, |
| "loss": 1.6029993295669556, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 4.96791, |
| "step": 97, |
| "tokens/total": 198656, |
| "tokens/train_per_sec_per_gpu": 134.14, |
| "tokens/trainable": 197231 |
| }, |
| { |
| "epoch": 1.0544354838709677, |
| "grad_norm": 0.7890044450759888, |
| "learning_rate": 0.00018171286711696934, |
| "loss": 2.068784236907959, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 7.91519, |
| "step": 98, |
| "tokens/total": 200704, |
| "tokens/train_per_sec_per_gpu": 132.51, |
| "tokens/trainable": 199274 |
| }, |
| { |
| "epoch": 1.0564516129032258, |
| "grad_norm": 0.7956987023353577, |
| "learning_rate": 0.0001813461119891184, |
| "loss": 1.9784281253814697, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 7.23137, |
| "step": 99, |
| "tokens/total": 202752, |
| "tokens/train_per_sec_per_gpu": 135.46, |
| "tokens/trainable": 201310 |
| }, |
| { |
| "epoch": 1.0584677419354838, |
| "grad_norm": 0.7803149223327637, |
| "learning_rate": 0.00018097609345051025, |
| "loss": 1.885071039199829, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 6.58682, |
| "step": 100, |
| "tokens/total": 204800, |
| "tokens/train_per_sec_per_gpu": 133.55, |
| "tokens/trainable": 203342 |
| }, |
| { |
| "epoch": 1.060483870967742, |
| "grad_norm": 0.8120332956314087, |
| "learning_rate": 0.00018060282634540053, |
| "loss": 2.0599560737609863, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 7.84563, |
| "step": 101, |
| "tokens/total": 206848, |
| "tokens/train_per_sec_per_gpu": 133.67, |
| "tokens/trainable": 205365 |
| }, |
| { |
| "epoch": 1.0625, |
| "grad_norm": 0.7515849471092224, |
| "learning_rate": 0.00018022632564836948, |
| "loss": 1.4255881309509277, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 4.1603, |
| "step": 102, |
| "tokens/total": 208896, |
| "tokens/train_per_sec_per_gpu": 133.07, |
| "tokens/trainable": 207388 |
| }, |
| { |
| "epoch": 1.064516129032258, |
| "grad_norm": 0.8653085231781006, |
| "learning_rate": 0.0001798466064637214, |
| "loss": 2.1312050819396973, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 8.42501, |
| "step": 103, |
| "tokens/total": 210944, |
| "tokens/train_per_sec_per_gpu": 135.16, |
| "tokens/trainable": 209420 |
| }, |
| { |
| "epoch": 1.066532258064516, |
| "grad_norm": 0.8592569231987, |
| "learning_rate": 0.00017946368402487845, |
| "loss": 1.9539296627044678, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 7.05636, |
| "step": 104, |
| "tokens/total": 212992, |
| "tokens/train_per_sec_per_gpu": 131.63, |
| "tokens/trainable": 211462 |
| }, |
| { |
| "epoch": 1.0685483870967742, |
| "grad_norm": 0.7780154347419739, |
| "learning_rate": 0.00017907757369376985, |
| "loss": 1.9227118492126465, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 6.83948, |
| "step": 105, |
| "tokens/total": 215040, |
| "tokens/train_per_sec_per_gpu": 134.27, |
| "tokens/trainable": 213491 |
| }, |
| { |
| "epoch": 1.0705645161290323, |
| "grad_norm": 0.7553818225860596, |
| "learning_rate": 0.00017868829096021527, |
| "loss": 1.7215089797973633, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 5.59296, |
| "step": 106, |
| "tokens/total": 217088, |
| "tokens/train_per_sec_per_gpu": 134.03, |
| "tokens/trainable": 215522 |
| }, |
| { |
| "epoch": 1.0725806451612903, |
| "grad_norm": 0.8612887263298035, |
| "learning_rate": 0.00017829585144130356, |
| "loss": 2.0418860912323, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 7.70513, |
| "step": 107, |
| "tokens/total": 219136, |
| "tokens/train_per_sec_per_gpu": 134.46, |
| "tokens/trainable": 217548 |
| }, |
| { |
| "epoch": 1.0745967741935485, |
| "grad_norm": 0.7575510740280151, |
| "learning_rate": 0.0001779002708807662, |
| "loss": 1.6365363597869873, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 5.13734, |
| "step": 108, |
| "tokens/total": 221184, |
| "tokens/train_per_sec_per_gpu": 137.71, |
| "tokens/trainable": 219593 |
| }, |
| { |
| "epoch": 1.0766129032258065, |
| "grad_norm": 0.8436228632926941, |
| "learning_rate": 0.0001775015651483459, |
| "loss": 2.169118881225586, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 8.75057, |
| "step": 109, |
| "tokens/total": 223232, |
| "tokens/train_per_sec_per_gpu": 134.87, |
| "tokens/trainable": 221619 |
| }, |
| { |
| "epoch": 1.0786290322580645, |
| "grad_norm": 0.7953697443008423, |
| "learning_rate": 0.00017709975023915949, |
| "loss": 1.9425008296966553, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 6.97618, |
| "step": 110, |
| "tokens/total": 225280, |
| "tokens/train_per_sec_per_gpu": 137.3, |
| "tokens/trainable": 223657 |
| }, |
| { |
| "epoch": 1.0806451612903225, |
| "grad_norm": 0.7815860509872437, |
| "learning_rate": 0.0001766948422730567, |
| "loss": 1.682219386100769, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 5.37748, |
| "step": 111, |
| "tokens/total": 227328, |
| "tokens/train_per_sec_per_gpu": 132.35, |
| "tokens/trainable": 225684 |
| }, |
| { |
| "epoch": 1.0826612903225807, |
| "grad_norm": 0.9288930892944336, |
| "learning_rate": 0.0001762868574939732, |
| "loss": 2.3112406730651855, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 10.08693, |
| "step": 112, |
| "tokens/total": 229376, |
| "tokens/train_per_sec_per_gpu": 135.23, |
| "tokens/trainable": 227725 |
| }, |
| { |
| "epoch": 1.0846774193548387, |
| "grad_norm": 0.8349924087524414, |
| "learning_rate": 0.0001758758122692791, |
| "loss": 1.882235050201416, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 6.56817, |
| "step": 113, |
| "tokens/total": 231424, |
| "tokens/train_per_sec_per_gpu": 134.29, |
| "tokens/trainable": 229762 |
| }, |
| { |
| "epoch": 1.0866935483870968, |
| "grad_norm": 0.7674592137336731, |
| "learning_rate": 0.00017546172308912213, |
| "loss": 1.8356249332427979, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 6.26905, |
| "step": 114, |
| "tokens/total": 233472, |
| "tokens/train_per_sec_per_gpu": 135.38, |
| "tokens/trainable": 231797 |
| }, |
| { |
| "epoch": 1.0887096774193548, |
| "grad_norm": 0.7179352641105652, |
| "learning_rate": 0.00017504460656576627, |
| "loss": 1.7059075832366943, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 5.50638, |
| "step": 115, |
| "tokens/total": 235520, |
| "tokens/train_per_sec_per_gpu": 134.51, |
| "tokens/trainable": 233832 |
| }, |
| { |
| "epoch": 1.090725806451613, |
| "grad_norm": 0.8075538873672485, |
| "learning_rate": 0.0001746244794329252, |
| "loss": 2.1125102043151855, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 8.26897, |
| "step": 116, |
| "tokens/total": 237568, |
| "tokens/train_per_sec_per_gpu": 132.98, |
| "tokens/trainable": 235856 |
| }, |
| { |
| "epoch": 1.092741935483871, |
| "grad_norm": 0.735394299030304, |
| "learning_rate": 0.0001742013585450911, |
| "loss": 1.6673572063446045, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 5.29815, |
| "step": 117, |
| "tokens/total": 239616, |
| "tokens/train_per_sec_per_gpu": 133.83, |
| "tokens/trainable": 237885 |
| }, |
| { |
| "epoch": 1.094758064516129, |
| "grad_norm": 0.7311357259750366, |
| "learning_rate": 0.00017377526087685832, |
| "loss": 1.642756462097168, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 5.1694, |
| "step": 118, |
| "tokens/total": 241664, |
| "tokens/train_per_sec_per_gpu": 135.95, |
| "tokens/trainable": 239928 |
| }, |
| { |
| "epoch": 1.096774193548387, |
| "grad_norm": 0.7718885540962219, |
| "learning_rate": 0.0001733462035222426, |
| "loss": 1.8600523471832275, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 6.42407, |
| "step": 119, |
| "tokens/total": 243712, |
| "tokens/train_per_sec_per_gpu": 133.17, |
| "tokens/trainable": 241964 |
| }, |
| { |
| "epoch": 1.0987903225806452, |
| "grad_norm": 0.7755111455917358, |
| "learning_rate": 0.0001729142036939951, |
| "loss": 1.783015489578247, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 5.94776, |
| "step": 120, |
| "tokens/total": 245760, |
| "tokens/train_per_sec_per_gpu": 134.83, |
| "tokens/trainable": 243989 |
| }, |
| { |
| "epoch": 1.1008064516129032, |
| "grad_norm": 0.7631067633628845, |
| "learning_rate": 0.000172479278722912, |
| "loss": 1.6326531171798706, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 5.11743, |
| "step": 121, |
| "tokens/total": 247808, |
| "tokens/train_per_sec_per_gpu": 133.96, |
| "tokens/trainable": 246025 |
| }, |
| { |
| "epoch": 1.1028225806451613, |
| "grad_norm": 0.6940619349479675, |
| "learning_rate": 0.0001720414460571392, |
| "loss": 1.6045302152633667, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 4.97552, |
| "step": 122, |
| "tokens/total": 249856, |
| "tokens/train_per_sec_per_gpu": 131.93, |
| "tokens/trainable": 248049 |
| }, |
| { |
| "epoch": 1.1048387096774193, |
| "grad_norm": 0.7802858948707581, |
| "learning_rate": 0.0001716007232614723, |
| "loss": 1.4858357906341553, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 4.41866, |
| "step": 123, |
| "tokens/total": 251904, |
| "tokens/train_per_sec_per_gpu": 132.63, |
| "tokens/trainable": 250089 |
| }, |
| { |
| "epoch": 1.1068548387096775, |
| "grad_norm": 0.872261643409729, |
| "learning_rate": 0.000171157128016652, |
| "loss": 1.9680781364440918, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 7.15691, |
| "step": 124, |
| "tokens/total": 253952, |
| "tokens/train_per_sec_per_gpu": 134.69, |
| "tokens/trainable": 252120 |
| }, |
| { |
| "epoch": 1.1088709677419355, |
| "grad_norm": 0.814172625541687, |
| "learning_rate": 0.00017071067811865476, |
| "loss": 1.9667139053344727, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 7.14715, |
| "step": 125, |
| "tokens/total": 256000, |
| "tokens/train_per_sec_per_gpu": 131.2, |
| "tokens/trainable": 254151 |
| }, |
| { |
| "epoch": 1.1108870967741935, |
| "grad_norm": 0.7579799890518188, |
| "learning_rate": 0.0001702613914779789, |
| "loss": 1.8390204906463623, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 6.29037, |
| "step": 126, |
| "tokens/total": 258048, |
| "tokens/train_per_sec_per_gpu": 130.65, |
| "tokens/trainable": 256197 |
| }, |
| { |
| "epoch": 1.1129032258064515, |
| "grad_norm": 0.89185631275177, |
| "learning_rate": 0.0001698092861189259, |
| "loss": 2.3891618251800537, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 10.90435, |
| "step": 127, |
| "tokens/total": 260096, |
| "tokens/train_per_sec_per_gpu": 131.31, |
| "tokens/trainable": 258228 |
| }, |
| { |
| "epoch": 1.1149193548387097, |
| "grad_norm": 0.8794082999229431, |
| "learning_rate": 0.00016935438017887772, |
| "loss": 2.1794254779815674, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 8.84123, |
| "step": 128, |
| "tokens/total": 262144, |
| "tokens/train_per_sec_per_gpu": 132.45, |
| "tokens/trainable": 260247 |
| }, |
| { |
| "epoch": 1.1169354838709677, |
| "grad_norm": 0.7506632804870605, |
| "learning_rate": 0.00016889669190756868, |
| "loss": 1.693035364151001, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 5.43596, |
| "step": 129, |
| "tokens/total": 264192, |
| "tokens/train_per_sec_per_gpu": 133.97, |
| "tokens/trainable": 262269 |
| }, |
| { |
| "epoch": 1.1189516129032258, |
| "grad_norm": 0.718761146068573, |
| "learning_rate": 0.00016843623966635366, |
| "loss": 1.2158582210540771, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 3.37319, |
| "step": 130, |
| "tokens/total": 266240, |
| "tokens/train_per_sec_per_gpu": 129.03, |
| "tokens/trainable": 264279 |
| }, |
| { |
| "epoch": 1.120967741935484, |
| "grad_norm": 0.9651544690132141, |
| "learning_rate": 0.0001679730419274713, |
| "loss": 2.0829291343688965, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 8.02795, |
| "step": 131, |
| "tokens/total": 268288, |
| "tokens/train_per_sec_per_gpu": 126.71, |
| "tokens/trainable": 266304 |
| }, |
| { |
| "epoch": 1.122983870967742, |
| "grad_norm": 0.833846390247345, |
| "learning_rate": 0.0001675071172733031, |
| "loss": 1.8694937229156494, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 6.48501, |
| "step": 132, |
| "tokens/total": 270336, |
| "tokens/train_per_sec_per_gpu": 128.42, |
| "tokens/trainable": 268327 |
| }, |
| { |
| "epoch": 1.125, |
| "grad_norm": 0.8215437531471252, |
| "learning_rate": 0.00016703848439562785, |
| "loss": 1.878624677658081, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 6.5445, |
| "step": 133, |
| "tokens/total": 272384, |
| "tokens/train_per_sec_per_gpu": 130.51, |
| "tokens/trainable": 270345 |
| }, |
| { |
| "epoch": 1.127016129032258, |
| "grad_norm": 0.8171601891517639, |
| "learning_rate": 0.00016656716209487174, |
| "loss": 1.9828517436981201, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 7.26343, |
| "step": 134, |
| "tokens/total": 274432, |
| "tokens/train_per_sec_per_gpu": 134.93, |
| "tokens/trainable": 272374 |
| }, |
| { |
| "epoch": 1.129032258064516, |
| "grad_norm": 0.966566801071167, |
| "learning_rate": 0.0001660931692793541, |
| "loss": 2.0265626907348633, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 7.58796, |
| "step": 135, |
| "tokens/total": 276480, |
| "tokens/train_per_sec_per_gpu": 133.85, |
| "tokens/trainable": 274398 |
| }, |
| { |
| "epoch": 1.1310483870967742, |
| "grad_norm": 0.9244574904441833, |
| "learning_rate": 0.000165616524964529, |
| "loss": 2.0933644771575928, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 8.11216, |
| "step": 136, |
| "tokens/total": 278528, |
| "tokens/train_per_sec_per_gpu": 10.87, |
| "tokens/trainable": 276439 |
| }, |
| { |
| "epoch": 1.1330645161290323, |
| "grad_norm": 0.8794076442718506, |
| "learning_rate": 0.00016513724827222227, |
| "loss": 2.0877299308776855, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 8.06658, |
| "step": 137, |
| "tokens/total": 280576, |
| "tokens/train_per_sec_per_gpu": 65.05, |
| "tokens/trainable": 278477 |
| }, |
| { |
| "epoch": 1.1350806451612903, |
| "grad_norm": 0.8859477639198303, |
| "learning_rate": 0.00016465535842986434, |
| "loss": 2.188518524169922, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 8.92199, |
| "step": 138, |
| "tokens/total": 282624, |
| "tokens/train_per_sec_per_gpu": 84.63, |
| "tokens/trainable": 280494 |
| }, |
| { |
| "epoch": 1.1370967741935485, |
| "grad_norm": 0.8838194012641907, |
| "learning_rate": 0.000164170874769719, |
| "loss": 1.8421143293380737, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 6.30987, |
| "step": 139, |
| "tokens/total": 284672, |
| "tokens/train_per_sec_per_gpu": 87.05, |
| "tokens/trainable": 282504 |
| }, |
| { |
| "epoch": 1.1391129032258065, |
| "grad_norm": 0.7671190500259399, |
| "learning_rate": 0.00016368381672810786, |
| "loss": 1.5053142309188843, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 4.50557, |
| "step": 140, |
| "tokens/total": 286720, |
| "tokens/train_per_sec_per_gpu": 87.13, |
| "tokens/trainable": 284532 |
| }, |
| { |
| "epoch": 1.1411290322580645, |
| "grad_norm": 1.073824167251587, |
| "learning_rate": 0.0001631942038446304, |
| "loss": 2.0518157482147217, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 7.78202, |
| "step": 141, |
| "tokens/total": 288768, |
| "tokens/train_per_sec_per_gpu": 86.56, |
| "tokens/trainable": 286491 |
| }, |
| { |
| "epoch": 1.1431451612903225, |
| "grad_norm": 0.9559262990951538, |
| "learning_rate": 0.00016270205576138032, |
| "loss": 1.7970073223114014, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 6.03157, |
| "step": 142, |
| "tokens/total": 290816, |
| "tokens/train_per_sec_per_gpu": 63.19, |
| "tokens/trainable": 287984 |
| }, |
| { |
| "epoch": 2.002016129032258, |
| "grad_norm": 0.9580713510513306, |
| "learning_rate": 0.00016220739222215738, |
| "loss": 2.409911632537842, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 11.13298, |
| "step": 143, |
| "tokens/total": 292864, |
| "tokens/train_per_sec_per_gpu": 84.37, |
| "tokens/trainable": 290021 |
| }, |
| { |
| "epoch": 2.004032258064516, |
| "grad_norm": 0.7835400700569153, |
| "learning_rate": 0.00016171023307167545, |
| "loss": 1.5976440906524658, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 4.94138, |
| "step": 144, |
| "tokens/total": 294912, |
| "tokens/train_per_sec_per_gpu": 88.19, |
| "tokens/trainable": 292069 |
| }, |
| { |
| "epoch": 2.006048387096774, |
| "grad_norm": 0.8442096710205078, |
| "learning_rate": 0.0001612105982547663, |
| "loss": 1.6255199909210205, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 5.08106, |
| "step": 145, |
| "tokens/total": 296960, |
| "tokens/train_per_sec_per_gpu": 92.67, |
| "tokens/trainable": 294108 |
| }, |
| { |
| "epoch": 2.0080645161290325, |
| "grad_norm": 0.7924631834030151, |
| "learning_rate": 0.00016070850781557948, |
| "loss": 1.660654067993164, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 5.26275, |
| "step": 146, |
| "tokens/total": 299008, |
| "tokens/train_per_sec_per_gpu": 86.3, |
| "tokens/trainable": 296150 |
| }, |
| { |
| "epoch": 2.0100806451612905, |
| "grad_norm": 0.8508074283599854, |
| "learning_rate": 0.0001602039818967783, |
| "loss": 1.522460699081421, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 4.58349, |
| "step": 147, |
| "tokens/total": 301056, |
| "tokens/train_per_sec_per_gpu": 90.44, |
| "tokens/trainable": 298194 |
| }, |
| { |
| "epoch": 2.0120967741935485, |
| "grad_norm": 0.8621785044670105, |
| "learning_rate": 0.00015969704073873157, |
| "loss": 1.6797025203704834, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 5.36396, |
| "step": 148, |
| "tokens/total": 303104, |
| "tokens/train_per_sec_per_gpu": 88.14, |
| "tokens/trainable": 300237 |
| }, |
| { |
| "epoch": 2.0141129032258065, |
| "grad_norm": 0.9323299527168274, |
| "learning_rate": 0.0001591877046787017, |
| "loss": 1.7540191411972046, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 5.77778, |
| "step": 149, |
| "tokens/total": 305152, |
| "tokens/train_per_sec_per_gpu": 89.25, |
| "tokens/trainable": 302278 |
| }, |
| { |
| "epoch": 2.0161290322580645, |
| "grad_norm": 0.9931126832962036, |
| "learning_rate": 0.00015867599415002895, |
| "loss": 1.766423225402832, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 5.84989, |
| "step": 150, |
| "tokens/total": 307200, |
| "tokens/train_per_sec_per_gpu": 89.97, |
| "tokens/trainable": 304305 |
| }, |
| { |
| "epoch": 2.0181451612903225, |
| "grad_norm": 1.1118701696395874, |
| "learning_rate": 0.00015816192968131138, |
| "loss": 2.2324070930480957, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 9.32228, |
| "step": 151, |
| "tokens/total": 309248, |
| "tokens/train_per_sec_per_gpu": 84.81, |
| "tokens/trainable": 306352 |
| }, |
| { |
| "epoch": 2.0201612903225805, |
| "grad_norm": 1.1531903743743896, |
| "learning_rate": 0.0001576455318955816, |
| "loss": 2.0641446113586426, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 7.87856, |
| "step": 152, |
| "tokens/total": 311296, |
| "tokens/train_per_sec_per_gpu": 91.08, |
| "tokens/trainable": 308400 |
| }, |
| { |
| "epoch": 2.0221774193548385, |
| "grad_norm": 0.9435677528381348, |
| "learning_rate": 0.00015712682150947923, |
| "loss": 1.8673259019851685, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 6.47097, |
| "step": 153, |
| "tokens/total": 313344, |
| "tokens/train_per_sec_per_gpu": 90.76, |
| "tokens/trainable": 310436 |
| }, |
| { |
| "epoch": 2.024193548387097, |
| "grad_norm": 0.8857016563415527, |
| "learning_rate": 0.00015660581933241993, |
| "loss": 1.4891250133514404, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 4.43321, |
| "step": 154, |
| "tokens/total": 315392, |
| "tokens/train_per_sec_per_gpu": 91.7, |
| "tokens/trainable": 312479 |
| }, |
| { |
| "epoch": 2.026209677419355, |
| "grad_norm": 1.0294502973556519, |
| "learning_rate": 0.00015608254626576048, |
| "loss": 2.35011625289917, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 10.48679, |
| "step": 155, |
| "tokens/total": 317440, |
| "tokens/train_per_sec_per_gpu": 91.75, |
| "tokens/trainable": 314520 |
| }, |
| { |
| "epoch": 2.028225806451613, |
| "grad_norm": 0.9770046472549438, |
| "learning_rate": 0.00015555702330196023, |
| "loss": 1.99894118309021, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 7.38124, |
| "step": 156, |
| "tokens/total": 319488, |
| "tokens/train_per_sec_per_gpu": 89.42, |
| "tokens/trainable": 316550 |
| }, |
| { |
| "epoch": 2.030241935483871, |
| "grad_norm": 0.8702968955039978, |
| "learning_rate": 0.00015502927152373914, |
| "loss": 1.7607452869415283, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 5.81677, |
| "step": 157, |
| "tokens/total": 321536, |
| "tokens/train_per_sec_per_gpu": 86.57, |
| "tokens/trainable": 318590 |
| }, |
| { |
| "epoch": 2.032258064516129, |
| "grad_norm": 0.8458526134490967, |
| "learning_rate": 0.0001544993121032318, |
| "loss": 1.7635177373886108, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 5.83292, |
| "step": 158, |
| "tokens/total": 323584, |
| "tokens/train_per_sec_per_gpu": 95.81, |
| "tokens/trainable": 320630 |
| }, |
| { |
| "epoch": 2.034274193548387, |
| "grad_norm": 0.9898233413696289, |
| "learning_rate": 0.000153967166301138, |
| "loss": 1.5542548894882202, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 4.73156, |
| "step": 159, |
| "tokens/total": 325632, |
| "tokens/train_per_sec_per_gpu": 90.09, |
| "tokens/trainable": 322660 |
| }, |
| { |
| "epoch": 2.036290322580645, |
| "grad_norm": 1.072844386100769, |
| "learning_rate": 0.00015343285546587013, |
| "loss": 1.4093410968780518, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 4.09326, |
| "step": 160, |
| "tokens/total": 327680, |
| "tokens/train_per_sec_per_gpu": 86.27, |
| "tokens/trainable": 324691 |
| }, |
| { |
| "epoch": 2.038306451612903, |
| "grad_norm": 0.9787235260009766, |
| "learning_rate": 0.00015289640103269625, |
| "loss": 1.877701997756958, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 6.53846, |
| "step": 161, |
| "tokens/total": 329728, |
| "tokens/train_per_sec_per_gpu": 87.58, |
| "tokens/trainable": 326732 |
| }, |
| { |
| "epoch": 2.0403225806451615, |
| "grad_norm": 0.8260992765426636, |
| "learning_rate": 0.00015235782452288068, |
| "loss": 1.4073071479797363, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 4.08494, |
| "step": 162, |
| "tokens/total": 331776, |
| "tokens/train_per_sec_per_gpu": 91.15, |
| "tokens/trainable": 328766 |
| }, |
| { |
| "epoch": 2.0423387096774195, |
| "grad_norm": 0.9031779766082764, |
| "learning_rate": 0.0001518171475428202, |
| "loss": 1.6873669624328613, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 5.40523, |
| "step": 163, |
| "tokens/total": 333824, |
| "tokens/train_per_sec_per_gpu": 87.34, |
| "tokens/trainable": 330799 |
| }, |
| { |
| "epoch": 2.0443548387096775, |
| "grad_norm": 0.9085680842399597, |
| "learning_rate": 0.00015127439178317745, |
| "loss": 1.860163688659668, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 6.42479, |
| "step": 164, |
| "tokens/total": 335872, |
| "tokens/train_per_sec_per_gpu": 88.02, |
| "tokens/trainable": 332832 |
| }, |
| { |
| "epoch": 2.0463709677419355, |
| "grad_norm": 1.0631558895111084, |
| "learning_rate": 0.00015072957901801076, |
| "loss": 1.7626943588256836, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 5.82812, |
| "step": 165, |
| "tokens/total": 337920, |
| "tokens/train_per_sec_per_gpu": 96.08, |
| "tokens/trainable": 334880 |
| }, |
| { |
| "epoch": 2.0483870967741935, |
| "grad_norm": 0.9707496166229248, |
| "learning_rate": 0.0001501827311039005, |
| "loss": 1.3545994758605957, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 3.87521, |
| "step": 166, |
| "tokens/total": 339968, |
| "tokens/train_per_sec_per_gpu": 95.22, |
| "tokens/trainable": 336918 |
| }, |
| { |
| "epoch": 2.0504032258064515, |
| "grad_norm": 0.9129533767700195, |
| "learning_rate": 0.0001496338699790724, |
| "loss": 1.9540798664093018, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 7.05742, |
| "step": 167, |
| "tokens/total": 342016, |
| "tokens/train_per_sec_per_gpu": 94.08, |
| "tokens/trainable": 338950 |
| }, |
| { |
| "epoch": 2.0524193548387095, |
| "grad_norm": 1.0040849447250366, |
| "learning_rate": 0.00014908301766251739, |
| "loss": 1.9150488376617432, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 6.78727, |
| "step": 168, |
| "tokens/total": 344064, |
| "tokens/train_per_sec_per_gpu": 93.34, |
| "tokens/trainable": 340985 |
| }, |
| { |
| "epoch": 2.0544354838709675, |
| "grad_norm": 0.892105221748352, |
| "learning_rate": 0.00014853019625310813, |
| "loss": 1.5278139114379883, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 4.60809, |
| "step": 169, |
| "tokens/total": 346112, |
| "tokens/train_per_sec_per_gpu": 91.28, |
| "tokens/trainable": 343029 |
| }, |
| { |
| "epoch": 2.056451612903226, |
| "grad_norm": 0.9372109174728394, |
| "learning_rate": 0.00014797542792871265, |
| "loss": 1.9480211734771729, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 7.01479, |
| "step": 170, |
| "tokens/total": 348160, |
| "tokens/train_per_sec_per_gpu": 89.14, |
| "tokens/trainable": 345066 |
| }, |
| { |
| "epoch": 2.058467741935484, |
| "grad_norm": 0.9851438403129578, |
| "learning_rate": 0.0001474187349453045, |
| "loss": 2.036619186401367, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 7.66465, |
| "step": 171, |
| "tokens/total": 350208, |
| "tokens/train_per_sec_per_gpu": 89.33, |
| "tokens/trainable": 347097 |
| }, |
| { |
| "epoch": 2.060483870967742, |
| "grad_norm": 1.0404151678085327, |
| "learning_rate": 0.00014686013963607, |
| "loss": 1.9593498706817627, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 7.09471, |
| "step": 172, |
| "tokens/total": 352256, |
| "tokens/train_per_sec_per_gpu": 90.31, |
| "tokens/trainable": 349138 |
| }, |
| { |
| "epoch": 2.0625, |
| "grad_norm": 0.9579296708106995, |
| "learning_rate": 0.00014629966441051208, |
| "loss": 1.647827386856079, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 5.19568, |
| "step": 173, |
| "tokens/total": 354304, |
| "tokens/train_per_sec_per_gpu": 88.57, |
| "tokens/trainable": 351184 |
| }, |
| { |
| "epoch": 2.064516129032258, |
| "grad_norm": 0.9718630909919739, |
| "learning_rate": 0.0001457373317535515, |
| "loss": 1.6372270584106445, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 5.14089, |
| "step": 174, |
| "tokens/total": 356352, |
| "tokens/train_per_sec_per_gpu": 84.71, |
| "tokens/trainable": 353227 |
| }, |
| { |
| "epoch": 2.066532258064516, |
| "grad_norm": 0.9516034126281738, |
| "learning_rate": 0.0001451731642246247, |
| "loss": 1.6231439113616943, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 5.069, |
| "step": 175, |
| "tokens/total": 358400, |
| "tokens/train_per_sec_per_gpu": 89.05, |
| "tokens/trainable": 355254 |
| }, |
| { |
| "epoch": 2.068548387096774, |
| "grad_norm": 0.9384471774101257, |
| "learning_rate": 0.00014460718445677876, |
| "loss": 1.7225630283355713, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 5.59886, |
| "step": 176, |
| "tokens/total": 360448, |
| "tokens/train_per_sec_per_gpu": 88.75, |
| "tokens/trainable": 357296 |
| }, |
| { |
| "epoch": 2.0705645161290325, |
| "grad_norm": 0.9477818012237549, |
| "learning_rate": 0.00014403941515576344, |
| "loss": 1.3852624893188477, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 3.99587, |
| "step": 177, |
| "tokens/total": 362496, |
| "tokens/train_per_sec_per_gpu": 92.17, |
| "tokens/trainable": 359324 |
| }, |
| { |
| "epoch": 2.0725806451612905, |
| "grad_norm": 1.0369067192077637, |
| "learning_rate": 0.00014346987909912023, |
| "loss": 1.883034348487854, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 6.57342, |
| "step": 178, |
| "tokens/total": 364544, |
| "tokens/train_per_sec_per_gpu": 92.07, |
| "tokens/trainable": 361370 |
| }, |
| { |
| "epoch": 2.0745967741935485, |
| "grad_norm": 1.0867875814437866, |
| "learning_rate": 0.00014289859913526874, |
| "loss": 2.1149420738220215, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 8.28911, |
| "step": 179, |
| "tokens/total": 366592, |
| "tokens/train_per_sec_per_gpu": 135.2, |
| "tokens/trainable": 363411 |
| }, |
| { |
| "epoch": 2.0766129032258065, |
| "grad_norm": 1.0646148920059204, |
| "learning_rate": 0.00014232559818258984, |
| "loss": 1.6198821067810059, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 5.05249, |
| "step": 180, |
| "tokens/total": 368640, |
| "tokens/train_per_sec_per_gpu": 135.66, |
| "tokens/trainable": 365443 |
| }, |
| { |
| "epoch": 2.0786290322580645, |
| "grad_norm": 0.9128944873809814, |
| "learning_rate": 0.00014175089922850633, |
| "loss": 1.5648690462112427, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 4.78205, |
| "step": 181, |
| "tokens/total": 370688, |
| "tokens/train_per_sec_per_gpu": 133.95, |
| "tokens/trainable": 367472 |
| }, |
| { |
| "epoch": 2.0806451612903225, |
| "grad_norm": 1.1343512535095215, |
| "learning_rate": 0.00014117452532856083, |
| "loss": 1.924842119216919, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 6.85407, |
| "step": 182, |
| "tokens/total": 372736, |
| "tokens/train_per_sec_per_gpu": 136.2, |
| "tokens/trainable": 369508 |
| }, |
| { |
| "epoch": 2.0826612903225805, |
| "grad_norm": 0.9804732799530029, |
| "learning_rate": 0.0001405964996054907, |
| "loss": 2.0507593154907227, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 7.7738, |
| "step": 183, |
| "tokens/total": 374784, |
| "tokens/train_per_sec_per_gpu": 133.13, |
| "tokens/trainable": 371540 |
| }, |
| { |
| "epoch": 2.0846774193548385, |
| "grad_norm": 1.1312826871871948, |
| "learning_rate": 0.00014001684524830057, |
| "loss": 1.557239055633545, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 4.7457, |
| "step": 184, |
| "tokens/total": 376832, |
| "tokens/train_per_sec_per_gpu": 136.66, |
| "tokens/trainable": 373580 |
| }, |
| { |
| "epoch": 2.086693548387097, |
| "grad_norm": 0.9265478253364563, |
| "learning_rate": 0.00013943558551133186, |
| "loss": 1.7553461790084839, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 5.78545, |
| "step": 185, |
| "tokens/total": 378880, |
| "tokens/train_per_sec_per_gpu": 135.57, |
| "tokens/trainable": 375627 |
| }, |
| { |
| "epoch": 2.088709677419355, |
| "grad_norm": 1.0380595922470093, |
| "learning_rate": 0.00013885274371333, |
| "loss": 1.7150152921676636, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 5.55676, |
| "step": 186, |
| "tokens/total": 380928, |
| "tokens/train_per_sec_per_gpu": 132.35, |
| "tokens/trainable": 377649 |
| }, |
| { |
| "epoch": 2.090725806451613, |
| "grad_norm": 1.0621381998062134, |
| "learning_rate": 0.000138268343236509, |
| "loss": 1.9849774837493896, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 7.27888, |
| "step": 187, |
| "tokens/total": 382976, |
| "tokens/train_per_sec_per_gpu": 131.15, |
| "tokens/trainable": 379649 |
| }, |
| { |
| "epoch": 2.092741935483871, |
| "grad_norm": 0.8717451691627502, |
| "learning_rate": 0.00013768240752561314, |
| "loss": 1.2543466091156006, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 3.50555, |
| "step": 188, |
| "tokens/total": 385024, |
| "tokens/train_per_sec_per_gpu": 134.1, |
| "tokens/trainable": 381680 |
| }, |
| { |
| "epoch": 2.094758064516129, |
| "grad_norm": 0.9604067206382751, |
| "learning_rate": 0.0001370949600869768, |
| "loss": 1.8013954162597656, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 6.0581, |
| "step": 189, |
| "tokens/total": 387072, |
| "tokens/train_per_sec_per_gpu": 132.56, |
| "tokens/trainable": 383719 |
| }, |
| { |
| "epoch": 2.096774193548387, |
| "grad_norm": 0.8730901479721069, |
| "learning_rate": 0.00013650602448758112, |
| "loss": 1.5253915786743164, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 4.59694, |
| "step": 190, |
| "tokens/total": 389120, |
| "tokens/train_per_sec_per_gpu": 131.79, |
| "tokens/trainable": 385752 |
| }, |
| { |
| "epoch": 2.098790322580645, |
| "grad_norm": 0.884779691696167, |
| "learning_rate": 0.0001359156243541087, |
| "loss": 1.194589376449585, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 3.3022, |
| "step": 191, |
| "tokens/total": 391168, |
| "tokens/train_per_sec_per_gpu": 134.36, |
| "tokens/trainable": 387786 |
| }, |
| { |
| "epoch": 2.100806451612903, |
| "grad_norm": 0.9849578142166138, |
| "learning_rate": 0.00013532378337199582, |
| "loss": 1.355954885482788, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 3.88046, |
| "step": 192, |
| "tokens/total": 393216, |
| "tokens/train_per_sec_per_gpu": 131.67, |
| "tokens/trainable": 389810 |
| }, |
| { |
| "epoch": 2.1028225806451615, |
| "grad_norm": 1.015483021736145, |
| "learning_rate": 0.00013473052528448201, |
| "loss": 1.7660787105560303, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 5.84788, |
| "step": 193, |
| "tokens/total": 395264, |
| "tokens/train_per_sec_per_gpu": 131.12, |
| "tokens/trainable": 391846 |
| }, |
| { |
| "epoch": 2.1048387096774195, |
| "grad_norm": 1.1974010467529297, |
| "learning_rate": 0.00013413587389165784, |
| "loss": 2.0104124546051025, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 7.4664, |
| "step": 194, |
| "tokens/total": 397312, |
| "tokens/train_per_sec_per_gpu": 132.58, |
| "tokens/trainable": 393887 |
| }, |
| { |
| "epoch": 2.1068548387096775, |
| "grad_norm": 0.8975300788879395, |
| "learning_rate": 0.00013353985304950973, |
| "loss": 1.717996597290039, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 5.57335, |
| "step": 195, |
| "tokens/total": 399360, |
| "tokens/train_per_sec_per_gpu": 131.46, |
| "tokens/trainable": 395920 |
| }, |
| { |
| "epoch": 2.1088709677419355, |
| "grad_norm": 0.9136057496070862, |
| "learning_rate": 0.00013294248666896328, |
| "loss": 1.597192406654358, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 4.93915, |
| "step": 196, |
| "tokens/total": 401408, |
| "tokens/train_per_sec_per_gpu": 132.82, |
| "tokens/trainable": 397953 |
| }, |
| { |
| "epoch": 2.1108870967741935, |
| "grad_norm": 0.8866783380508423, |
| "learning_rate": 0.0001323437987149238, |
| "loss": 1.5093090534210205, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 4.5236, |
| "step": 197, |
| "tokens/total": 403456, |
| "tokens/train_per_sec_per_gpu": 131.45, |
| "tokens/trainable": 399989 |
| }, |
| { |
| "epoch": 2.1129032258064515, |
| "grad_norm": 0.9615466594696045, |
| "learning_rate": 0.00013174381320531505, |
| "loss": 1.4923886060714722, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 4.44771, |
| "step": 198, |
| "tokens/total": 405504, |
| "tokens/train_per_sec_per_gpu": 137.67, |
| "tokens/trainable": 402007 |
| }, |
| { |
| "epoch": 2.1149193548387095, |
| "grad_norm": 1.0945857763290405, |
| "learning_rate": 0.0001311425542101154, |
| "loss": 1.8227579593658447, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 6.1889, |
| "step": 199, |
| "tokens/total": 407552, |
| "tokens/train_per_sec_per_gpu": 137.19, |
| "tokens/trainable": 404045 |
| }, |
| { |
| "epoch": 2.1169354838709675, |
| "grad_norm": 1.0081984996795654, |
| "learning_rate": 0.00013054004585039258, |
| "loss": 2.030510663986206, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 7.61798, |
| "step": 200, |
| "tokens/total": 409600, |
| "tokens/train_per_sec_per_gpu": 135.71, |
| "tokens/trainable": 406039 |
| }, |
| { |
| "epoch": 2.118951612903226, |
| "grad_norm": 0.9970818161964417, |
| "learning_rate": 0.00012993631229733582, |
| "loss": 1.589468002319336, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 4.90114, |
| "step": 201, |
| "tokens/total": 411648, |
| "tokens/train_per_sec_per_gpu": 134.62, |
| "tokens/trainable": 408035 |
| }, |
| { |
| "epoch": 2.120967741935484, |
| "grad_norm": 1.0629839897155762, |
| "learning_rate": 0.00012933137777128607, |
| "loss": 1.8885078430175781, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 6.6095, |
| "step": 202, |
| "tokens/total": 413696, |
| "tokens/train_per_sec_per_gpu": 137.54, |
| "tokens/trainable": 410049 |
| }, |
| { |
| "epoch": 2.122983870967742, |
| "grad_norm": 1.0215117931365967, |
| "learning_rate": 0.0001287252665407645, |
| "loss": 1.3511649370193481, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 3.86192, |
| "step": 203, |
| "tokens/total": 415744, |
| "tokens/train_per_sec_per_gpu": 136.73, |
| "tokens/trainable": 412060 |
| }, |
| { |
| "epoch": 2.125, |
| "grad_norm": 0.9783656597137451, |
| "learning_rate": 0.0001281180029214988, |
| "loss": 1.569549798965454, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 4.80448, |
| "step": 204, |
| "tokens/total": 417792, |
| "tokens/train_per_sec_per_gpu": 134.42, |
| "tokens/trainable": 414059 |
| }, |
| { |
| "epoch": 2.127016129032258, |
| "grad_norm": 0.9745059609413147, |
| "learning_rate": 0.0001275096112754478, |
| "loss": 1.5478930473327637, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 4.70155, |
| "step": 205, |
| "tokens/total": 419840, |
| "tokens/train_per_sec_per_gpu": 136.01, |
| "tokens/trainable": 416092 |
| }, |
| { |
| "epoch": 2.129032258064516, |
| "grad_norm": 1.0262490510940552, |
| "learning_rate": 0.000126900116009824, |
| "loss": 1.683368444442749, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 5.38366, |
| "step": 206, |
| "tokens/total": 421888, |
| "tokens/train_per_sec_per_gpu": 136.5, |
| "tokens/trainable": 418128 |
| }, |
| { |
| "epoch": 2.131048387096774, |
| "grad_norm": 0.885127604007721, |
| "learning_rate": 0.0001262895415761145, |
| "loss": 1.145145058631897, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 3.1429, |
| "step": 207, |
| "tokens/total": 423936, |
| "tokens/train_per_sec_per_gpu": 135.34, |
| "tokens/trainable": 420138 |
| }, |
| { |
| "epoch": 2.133064516129032, |
| "grad_norm": 1.0641528367996216, |
| "learning_rate": 0.00012567791246909994, |
| "loss": 1.5435967445373535, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 4.6814, |
| "step": 208, |
| "tokens/total": 425984, |
| "tokens/train_per_sec_per_gpu": 135.8, |
| "tokens/trainable": 422149 |
| }, |
| { |
| "epoch": 2.1350806451612905, |
| "grad_norm": 1.146147608757019, |
| "learning_rate": 0.00012506525322587207, |
| "loss": 1.7838163375854492, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 5.95253, |
| "step": 209, |
| "tokens/total": 428032, |
| "tokens/train_per_sec_per_gpu": 136.69, |
| "tokens/trainable": 424159 |
| }, |
| { |
| "epoch": 2.1370967741935485, |
| "grad_norm": 1.203956961631775, |
| "learning_rate": 0.0001244515884248491, |
| "loss": 1.9419140815734863, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 6.97208, |
| "step": 210, |
| "tokens/total": 430080, |
| "tokens/train_per_sec_per_gpu": 140.93, |
| "tokens/trainable": 426199 |
| }, |
| { |
| "epoch": 2.1391129032258065, |
| "grad_norm": 1.015429139137268, |
| "learning_rate": 0.00012383694268478993, |
| "loss": 1.591796875, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 4.91257, |
| "step": 211, |
| "tokens/total": 432128, |
| "tokens/train_per_sec_per_gpu": 139.89, |
| "tokens/trainable": 428188 |
| }, |
| { |
| "epoch": 2.1411290322580645, |
| "grad_norm": 1.0179929733276367, |
| "learning_rate": 0.0001232213406638062, |
| "loss": 1.5322093963623047, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 4.62839, |
| "step": 212, |
| "tokens/total": 434176, |
| "tokens/train_per_sec_per_gpu": 134.26, |
| "tokens/trainable": 430120 |
| }, |
| { |
| "epoch": 2.1431451612903225, |
| "grad_norm": 3.186521530151367, |
| "learning_rate": 0.0001226048070583735, |
| "loss": 1.9480620622634888, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 7.01508, |
| "step": 213, |
| "tokens/total": 436224, |
| "tokens/train_per_sec_per_gpu": 38.63, |
| "tokens/trainable": 430680 |
| }, |
| { |
| "epoch": 3.002016129032258, |
| "grad_norm": 0.9344216585159302, |
| "learning_rate": 0.00012198736660234009, |
| "loss": 1.3694523572921753, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 3.9332, |
| "step": 214, |
| "tokens/total": 438272, |
| "tokens/train_per_sec_per_gpu": 139.4, |
| "tokens/trainable": 432721 |
| }, |
| { |
| "epoch": 3.004032258064516, |
| "grad_norm": 1.053391456604004, |
| "learning_rate": 0.00012136904406593507, |
| "loss": 1.7003636360168457, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 5.47594, |
| "step": 215, |
| "tokens/total": 440320, |
| "tokens/train_per_sec_per_gpu": 141.09, |
| "tokens/trainable": 434759 |
| }, |
| { |
| "epoch": 3.006048387096774, |
| "grad_norm": 1.0795952081680298, |
| "learning_rate": 0.00012074986425477445, |
| "loss": 1.7129063606262207, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 5.54505, |
| "step": 216, |
| "tokens/total": 442368, |
| "tokens/train_per_sec_per_gpu": 140.09, |
| "tokens/trainable": 436801 |
| }, |
| { |
| "epoch": 3.0080645161290325, |
| "grad_norm": 0.9860684275627136, |
| "learning_rate": 0.00012012985200886602, |
| "loss": 1.1881208419799805, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 3.28091, |
| "step": 217, |
| "tokens/total": 444416, |
| "tokens/train_per_sec_per_gpu": 142.8, |
| "tokens/trainable": 438848 |
| }, |
| { |
| "epoch": 3.0100806451612905, |
| "grad_norm": 1.1295051574707031, |
| "learning_rate": 0.00011950903220161285, |
| "loss": 1.8362010717391968, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 6.27266, |
| "step": 218, |
| "tokens/total": 446464, |
| "tokens/train_per_sec_per_gpu": 142.72, |
| "tokens/trainable": 440887 |
| }, |
| { |
| "epoch": 3.0120967741935485, |
| "grad_norm": 1.1178804636001587, |
| "learning_rate": 0.00011888742973881543, |
| "loss": 1.6956043243408203, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 5.44994, |
| "step": 219, |
| "tokens/total": 448512, |
| "tokens/train_per_sec_per_gpu": 142.15, |
| "tokens/trainable": 442922 |
| }, |
| { |
| "epoch": 3.0141129032258065, |
| "grad_norm": 1.0459589958190918, |
| "learning_rate": 0.00011826506955767258, |
| "loss": 1.6698713302612305, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 5.31148, |
| "step": 220, |
| "tokens/total": 450560, |
| "tokens/train_per_sec_per_gpu": 140.62, |
| "tokens/trainable": 444956 |
| }, |
| { |
| "epoch": 3.0161290322580645, |
| "grad_norm": 1.1543166637420654, |
| "learning_rate": 0.00011764197662578086, |
| "loss": 1.579270839691162, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 4.85142, |
| "step": 221, |
| "tokens/total": 452608, |
| "tokens/train_per_sec_per_gpu": 140.09, |
| "tokens/trainable": 446986 |
| }, |
| { |
| "epoch": 3.0181451612903225, |
| "grad_norm": 1.282638430595398, |
| "learning_rate": 0.00011701817594013312, |
| "loss": 1.6817176342010498, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 5.37478, |
| "step": 222, |
| "tokens/total": 454656, |
| "tokens/train_per_sec_per_gpu": 143.82, |
| "tokens/trainable": 449023 |
| }, |
| { |
| "epoch": 3.0201612903225805, |
| "grad_norm": 1.0392447710037231, |
| "learning_rate": 0.00011639369252611552, |
| "loss": 1.2293877601623535, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 3.41914, |
| "step": 223, |
| "tokens/total": 456704, |
| "tokens/train_per_sec_per_gpu": 141.07, |
| "tokens/trainable": 451053 |
| }, |
| { |
| "epoch": 3.0221774193548385, |
| "grad_norm": 1.2307904958724976, |
| "learning_rate": 0.00011576855143650371, |
| "loss": 1.8099391460418701, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 6.11008, |
| "step": 224, |
| "tokens/total": 458752, |
| "tokens/train_per_sec_per_gpu": 141.68, |
| "tokens/trainable": 453087 |
| }, |
| { |
| "epoch": 3.024193548387097, |
| "grad_norm": 1.2422001361846924, |
| "learning_rate": 0.00011514277775045768, |
| "loss": 1.6516501903533936, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 5.21558, |
| "step": 225, |
| "tokens/total": 460800, |
| "tokens/train_per_sec_per_gpu": 138.34, |
| "tokens/trainable": 455113 |
| }, |
| { |
| "epoch": 3.026209677419355, |
| "grad_norm": 1.1288433074951172, |
| "learning_rate": 0.00011451639657251563, |
| "loss": 1.3742070198059082, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 3.95194, |
| "step": 226, |
| "tokens/total": 462848, |
| "tokens/train_per_sec_per_gpu": 140.84, |
| "tokens/trainable": 457161 |
| }, |
| { |
| "epoch": 3.028225806451613, |
| "grad_norm": 1.1234395503997803, |
| "learning_rate": 0.00011388943303158693, |
| "loss": 1.5823338031768799, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 4.8663, |
| "step": 227, |
| "tokens/total": 464896, |
| "tokens/train_per_sec_per_gpu": 139.68, |
| "tokens/trainable": 459193 |
| }, |
| { |
| "epoch": 3.030241935483871, |
| "grad_norm": 1.0792587995529175, |
| "learning_rate": 0.00011326191227994391, |
| "loss": 1.3610866069793701, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 3.90043, |
| "step": 228, |
| "tokens/total": 466944, |
| "tokens/train_per_sec_per_gpu": 141.05, |
| "tokens/trainable": 461236 |
| }, |
| { |
| "epoch": 3.032258064516129, |
| "grad_norm": 1.1624300479888916, |
| "learning_rate": 0.00011263385949221295, |
| "loss": 1.6541019678115845, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 5.22838, |
| "step": 229, |
| "tokens/total": 468992, |
| "tokens/train_per_sec_per_gpu": 138.91, |
| "tokens/trainable": 463270 |
| }, |
| { |
| "epoch": 3.034274193548387, |
| "grad_norm": 1.2286807298660278, |
| "learning_rate": 0.0001120052998643643, |
| "loss": 1.3511790037155151, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 3.86198, |
| "step": 230, |
| "tokens/total": 471040, |
| "tokens/train_per_sec_per_gpu": 139.55, |
| "tokens/trainable": 465306 |
| }, |
| { |
| "epoch": 3.036290322580645, |
| "grad_norm": 1.059576153755188, |
| "learning_rate": 0.00011137625861270151, |
| "loss": 1.3441061973571777, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 3.83476, |
| "step": 231, |
| "tokens/total": 473088, |
| "tokens/train_per_sec_per_gpu": 140.71, |
| "tokens/trainable": 467339 |
| }, |
| { |
| "epoch": 3.038306451612903, |
| "grad_norm": 1.110609769821167, |
| "learning_rate": 0.00011074676097284973, |
| "loss": 1.3131623268127441, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 3.71791, |
| "step": 232, |
| "tokens/total": 475136, |
| "tokens/train_per_sec_per_gpu": 138.97, |
| "tokens/trainable": 469379 |
| }, |
| { |
| "epoch": 3.0403225806451615, |
| "grad_norm": 1.062983512878418, |
| "learning_rate": 0.00011011683219874323, |
| "loss": 1.384516954421997, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 3.9929, |
| "step": 233, |
| "tokens/total": 477184, |
| "tokens/train_per_sec_per_gpu": 139.52, |
| "tokens/trainable": 471411 |
| }, |
| { |
| "epoch": 3.0423387096774195, |
| "grad_norm": 1.2548089027404785, |
| "learning_rate": 0.00010948649756161246, |
| "loss": 1.588539719581604, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 4.89659, |
| "step": 234, |
| "tokens/total": 479232, |
| "tokens/train_per_sec_per_gpu": 139.35, |
| "tokens/trainable": 473446 |
| }, |
| { |
| "epoch": 3.0443548387096775, |
| "grad_norm": 1.2708343267440796, |
| "learning_rate": 0.00010885578234897003, |
| "loss": 1.839888572692871, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 6.29584, |
| "step": 235, |
| "tokens/total": 481280, |
| "tokens/train_per_sec_per_gpu": 144.7, |
| "tokens/trainable": 475488 |
| }, |
| { |
| "epoch": 3.0463709677419355, |
| "grad_norm": 1.2227643728256226, |
| "learning_rate": 0.00010822471186359639, |
| "loss": 1.7494804859161377, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 5.75161, |
| "step": 236, |
| "tokens/total": 483328, |
| "tokens/train_per_sec_per_gpu": 143.61, |
| "tokens/trainable": 477534 |
| }, |
| { |
| "epoch": 3.0483870967741935, |
| "grad_norm": 1.117796778678894, |
| "learning_rate": 0.00010759331142252462, |
| "loss": 1.2630927562713623, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 3.53634, |
| "step": 237, |
| "tokens/total": 485376, |
| "tokens/train_per_sec_per_gpu": 139.48, |
| "tokens/trainable": 479561 |
| }, |
| { |
| "epoch": 3.0504032258064515, |
| "grad_norm": 1.3211694955825806, |
| "learning_rate": 0.00010696160635602487, |
| "loss": 1.8685176372528076, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 6.47869, |
| "step": 238, |
| "tokens/total": 487424, |
| "tokens/train_per_sec_per_gpu": 140.92, |
| "tokens/trainable": 481592 |
| }, |
| { |
| "epoch": 3.0524193548387095, |
| "grad_norm": 1.2283082008361816, |
| "learning_rate": 0.00010632962200658815, |
| "loss": 1.4223217964172363, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 4.14674, |
| "step": 239, |
| "tokens/total": 489472, |
| "tokens/train_per_sec_per_gpu": 139.3, |
| "tokens/trainable": 483631 |
| }, |
| { |
| "epoch": 3.0544354838709675, |
| "grad_norm": 1.1250922679901123, |
| "learning_rate": 0.00010569738372790956, |
| "loss": 1.3267250061035156, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 3.76868, |
| "step": 240, |
| "tokens/total": 491520, |
| "tokens/train_per_sec_per_gpu": 135.89, |
| "tokens/trainable": 485675 |
| }, |
| { |
| "epoch": 3.056451612903226, |
| "grad_norm": 1.1296156644821167, |
| "learning_rate": 0.00010506491688387127, |
| "loss": 1.3787105083465576, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 3.96978, |
| "step": 241, |
| "tokens/total": 493568, |
| "tokens/train_per_sec_per_gpu": 144.49, |
| "tokens/trainable": 487712 |
| }, |
| { |
| "epoch": 3.058467741935484, |
| "grad_norm": 1.263743281364441, |
| "learning_rate": 0.000104432246847525, |
| "loss": 1.6385829448699951, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 5.14787, |
| "step": 242, |
| "tokens/total": 495616, |
| "tokens/train_per_sec_per_gpu": 141.59, |
| "tokens/trainable": 489754 |
| }, |
| { |
| "epoch": 3.060483870967742, |
| "grad_norm": 1.190499186515808, |
| "learning_rate": 0.00010379939900007393, |
| "loss": 1.273460030555725, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 3.57319, |
| "step": 243, |
| "tokens/total": 497664, |
| "tokens/train_per_sec_per_gpu": 141.11, |
| "tokens/trainable": 491796 |
| }, |
| { |
| "epoch": 3.0625, |
| "grad_norm": 1.3139567375183105, |
| "learning_rate": 0.00010316639872985472, |
| "loss": 1.6341391801834106, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 5.12504, |
| "step": 244, |
| "tokens/total": 499712, |
| "tokens/train_per_sec_per_gpu": 140.26, |
| "tokens/trainable": 493840 |
| }, |
| { |
| "epoch": 3.064516129032258, |
| "grad_norm": 1.2812219858169556, |
| "learning_rate": 0.00010253327143131879, |
| "loss": 1.3600250482559204, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 3.89629, |
| "step": 245, |
| "tokens/total": 501760, |
| "tokens/train_per_sec_per_gpu": 138.47, |
| "tokens/trainable": 495860 |
| }, |
| { |
| "epoch": 3.066532258064516, |
| "grad_norm": 1.2698646783828735, |
| "learning_rate": 0.00010190004250401368, |
| "loss": 1.666378140449524, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 5.29296, |
| "step": 246, |
| "tokens/total": 503808, |
| "tokens/train_per_sec_per_gpu": 141.99, |
| "tokens/trainable": 497886 |
| }, |
| { |
| "epoch": 3.068548387096774, |
| "grad_norm": 1.275866985321045, |
| "learning_rate": 0.00010126673735156402, |
| "loss": 1.4717791080474854, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 4.35698, |
| "step": 247, |
| "tokens/total": 505856, |
| "tokens/train_per_sec_per_gpu": 142.33, |
| "tokens/trainable": 499930 |
| }, |
| { |
| "epoch": 3.0705645161290325, |
| "grad_norm": 1.4462482929229736, |
| "learning_rate": 0.00010063338138065234, |
| "loss": 1.6472115516662598, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 5.19248, |
| "step": 248, |
| "tokens/total": 507904, |
| "tokens/train_per_sec_per_gpu": 143.7, |
| "tokens/trainable": 501964 |
| }, |
| { |
| "epoch": 3.0725806451612905, |
| "grad_norm": 1.291642189025879, |
| "learning_rate": 0.0001, |
| "loss": 2.117743968963623, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 8.31236, |
| "step": 249, |
| "tokens/total": 509952, |
| "tokens/train_per_sec_per_gpu": 139.27, |
| "tokens/trainable": 503995 |
| }, |
| { |
| "epoch": 3.0745967741935485, |
| "grad_norm": 1.065657138824463, |
| "learning_rate": 9.936661861934765e-05, |
| "loss": 1.7818100452423096, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 5.9406, |
| "step": 250, |
| "tokens/total": 512000, |
| "tokens/train_per_sec_per_gpu": 139.06, |
| "tokens/trainable": 506033 |
| }, |
| { |
| "epoch": 3.0766129032258065, |
| "grad_norm": 1.2712163925170898, |
| "learning_rate": 9.8733262648436e-05, |
| "loss": 1.7442567348480225, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 5.72165, |
| "step": 251, |
| "tokens/total": 514048, |
| "tokens/train_per_sec_per_gpu": 139.84, |
| "tokens/trainable": 508057 |
| }, |
| { |
| "epoch": 3.0786290322580645, |
| "grad_norm": 1.2973222732543945, |
| "learning_rate": 9.809995749598632e-05, |
| "loss": 1.9768915176391602, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 7.22026, |
| "step": 252, |
| "tokens/total": 516096, |
| "tokens/train_per_sec_per_gpu": 139.92, |
| "tokens/trainable": 510079 |
| }, |
| { |
| "epoch": 3.0806451612903225, |
| "grad_norm": 1.0970314741134644, |
| "learning_rate": 9.746672856868123e-05, |
| "loss": 1.2245832681655884, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 3.40275, |
| "step": 253, |
| "tokens/total": 518144, |
| "tokens/train_per_sec_per_gpu": 142.79, |
| "tokens/trainable": 512112 |
| }, |
| { |
| "epoch": 3.0826612903225805, |
| "grad_norm": 1.3336235284805298, |
| "learning_rate": 9.683360127014529e-05, |
| "loss": 1.5929476022720337, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 4.91822, |
| "step": 254, |
| "tokens/total": 520192, |
| "tokens/train_per_sec_per_gpu": 138.39, |
| "tokens/trainable": 514140 |
| }, |
| { |
| "epoch": 3.0846774193548385, |
| "grad_norm": 1.2752783298492432, |
| "learning_rate": 9.620060099992609e-05, |
| "loss": 1.6711721420288086, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 5.3184, |
| "step": 255, |
| "tokens/total": 522240, |
| "tokens/train_per_sec_per_gpu": 138.11, |
| "tokens/trainable": 516164 |
| }, |
| { |
| "epoch": 3.086693548387097, |
| "grad_norm": 1.0667186975479126, |
| "learning_rate": 9.556775315247501e-05, |
| "loss": 1.4244043827056885, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 4.15538, |
| "step": 256, |
| "tokens/total": 524288, |
| "tokens/train_per_sec_per_gpu": 142.64, |
| "tokens/trainable": 518204 |
| }, |
| { |
| "epoch": 3.088709677419355, |
| "grad_norm": 1.2738869190216064, |
| "learning_rate": 9.493508311612874e-05, |
| "loss": 1.912049412727356, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 6.76694, |
| "step": 257, |
| "tokens/total": 526336, |
| "tokens/train_per_sec_per_gpu": 140.83, |
| "tokens/trainable": 520239 |
| }, |
| { |
| "epoch": 3.090725806451613, |
| "grad_norm": 1.1487785577774048, |
| "learning_rate": 9.430261627209044e-05, |
| "loss": 1.6021625995635986, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 4.96376, |
| "step": 258, |
| "tokens/total": 528384, |
| "tokens/train_per_sec_per_gpu": 142.29, |
| "tokens/trainable": 522256 |
| }, |
| { |
| "epoch": 3.092741935483871, |
| "grad_norm": 1.1487421989440918, |
| "learning_rate": 9.367037799341187e-05, |
| "loss": 1.5816335678100586, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 4.86289, |
| "step": 259, |
| "tokens/total": 530432, |
| "tokens/train_per_sec_per_gpu": 144.34, |
| "tokens/trainable": 524301 |
| }, |
| { |
| "epoch": 3.094758064516129, |
| "grad_norm": 1.1304370164871216, |
| "learning_rate": 9.303839364397511e-05, |
| "loss": 1.5841972827911377, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 4.87538, |
| "step": 260, |
| "tokens/total": 532480, |
| "tokens/train_per_sec_per_gpu": 140.53, |
| "tokens/trainable": 526337 |
| }, |
| { |
| "epoch": 3.096774193548387, |
| "grad_norm": 1.2210973501205444, |
| "learning_rate": 9.24066885774754e-05, |
| "loss": 1.3856146335601807, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 3.99728, |
| "step": 261, |
| "tokens/total": 534528, |
| "tokens/train_per_sec_per_gpu": 142.27, |
| "tokens/trainable": 528372 |
| }, |
| { |
| "epoch": 3.098790322580645, |
| "grad_norm": 1.106105089187622, |
| "learning_rate": 9.177528813640362e-05, |
| "loss": 1.5088179111480713, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 4.52138, |
| "step": 262, |
| "tokens/total": 536576, |
| "tokens/train_per_sec_per_gpu": 137.11, |
| "tokens/trainable": 530386 |
| }, |
| { |
| "epoch": 3.100806451612903, |
| "grad_norm": 1.2509136199951172, |
| "learning_rate": 9.114421765102999e-05, |
| "loss": 1.6389458179473877, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 5.14974, |
| "step": 263, |
| "tokens/total": 538624, |
| "tokens/train_per_sec_per_gpu": 142.74, |
| "tokens/trainable": 532411 |
| }, |
| { |
| "epoch": 3.1028225806451615, |
| "grad_norm": 1.1185474395751953, |
| "learning_rate": 9.051350243838756e-05, |
| "loss": 1.4692389965057373, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 4.34593, |
| "step": 264, |
| "tokens/total": 540672, |
| "tokens/train_per_sec_per_gpu": 141.72, |
| "tokens/trainable": 534431 |
| }, |
| { |
| "epoch": 3.1048387096774195, |
| "grad_norm": 1.253171443939209, |
| "learning_rate": 8.98831678012568e-05, |
| "loss": 2.0175833702087402, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 7.52013, |
| "step": 265, |
| "tokens/total": 542720, |
| "tokens/train_per_sec_per_gpu": 143.19, |
| "tokens/trainable": 536470 |
| }, |
| { |
| "epoch": 3.1068548387096775, |
| "grad_norm": 1.2736040353775024, |
| "learning_rate": 8.925323902715031e-05, |
| "loss": 1.5984770059585571, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 4.94549, |
| "step": 266, |
| "tokens/total": 544768, |
| "tokens/train_per_sec_per_gpu": 140.92, |
| "tokens/trainable": 538506 |
| }, |
| { |
| "epoch": 3.1088709677419355, |
| "grad_norm": 1.1199898719787598, |
| "learning_rate": 8.862374138729853e-05, |
| "loss": 1.399961233139038, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 4.05504, |
| "step": 267, |
| "tokens/total": 546816, |
| "tokens/train_per_sec_per_gpu": 139.15, |
| "tokens/trainable": 540530 |
| }, |
| { |
| "epoch": 3.1108870967741935, |
| "grad_norm": 1.2822198867797852, |
| "learning_rate": 8.799470013563573e-05, |
| "loss": 1.5098600387573242, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 4.5261, |
| "step": 268, |
| "tokens/total": 548864, |
| "tokens/train_per_sec_per_gpu": 141.42, |
| "tokens/trainable": 542552 |
| }, |
| { |
| "epoch": 3.1129032258064515, |
| "grad_norm": 1.323458194732666, |
| "learning_rate": 8.73661405077871e-05, |
| "loss": 1.529092788696289, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 4.61399, |
| "step": 269, |
| "tokens/total": 550912, |
| "tokens/train_per_sec_per_gpu": 142.05, |
| "tokens/trainable": 544574 |
| }, |
| { |
| "epoch": 3.1149193548387095, |
| "grad_norm": 1.1802997589111328, |
| "learning_rate": 8.67380877200561e-05, |
| "loss": 1.5494701862335205, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 4.70897, |
| "step": 270, |
| "tokens/total": 552960, |
| "tokens/train_per_sec_per_gpu": 136.79, |
| "tokens/trainable": 546576 |
| }, |
| { |
| "epoch": 3.1169354838709675, |
| "grad_norm": 1.1339328289031982, |
| "learning_rate": 8.611056696841312e-05, |
| "loss": 1.4412262439727783, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 4.22587, |
| "step": 271, |
| "tokens/total": 555008, |
| "tokens/train_per_sec_per_gpu": 139.97, |
| "tokens/trainable": 548595 |
| }, |
| { |
| "epoch": 3.118951612903226, |
| "grad_norm": 1.184397578239441, |
| "learning_rate": 8.54836034274844e-05, |
| "loss": 1.4191689491271973, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 4.13368, |
| "step": 272, |
| "tokens/total": 557056, |
| "tokens/train_per_sec_per_gpu": 138.8, |
| "tokens/trainable": 550640 |
| }, |
| { |
| "epoch": 3.120967741935484, |
| "grad_norm": 1.1473497152328491, |
| "learning_rate": 8.485722224954237e-05, |
| "loss": 1.5693676471710205, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 4.80361, |
| "step": 273, |
| "tokens/total": 559104, |
| "tokens/train_per_sec_per_gpu": 139.3, |
| "tokens/trainable": 552674 |
| }, |
| { |
| "epoch": 3.122983870967742, |
| "grad_norm": 1.356634497642517, |
| "learning_rate": 8.423144856349631e-05, |
| "loss": 1.9328263998031616, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 6.90901, |
| "step": 274, |
| "tokens/total": 561152, |
| "tokens/train_per_sec_per_gpu": 137.86, |
| "tokens/trainable": 554682 |
| }, |
| { |
| "epoch": 3.125, |
| "grad_norm": 1.1992149353027344, |
| "learning_rate": 8.36063074738845e-05, |
| "loss": 1.3870232105255127, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 4.00292, |
| "step": 275, |
| "tokens/total": 563200, |
| "tokens/train_per_sec_per_gpu": 140.61, |
| "tokens/trainable": 556700 |
| }, |
| { |
| "epoch": 3.127016129032258, |
| "grad_norm": 1.2201882600784302, |
| "learning_rate": 8.298182405986689e-05, |
| "loss": 1.576523780822754, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 4.83811, |
| "step": 276, |
| "tokens/total": 565248, |
| "tokens/train_per_sec_per_gpu": 140.13, |
| "tokens/trainable": 558705 |
| }, |
| { |
| "epoch": 3.129032258064516, |
| "grad_norm": 1.4621694087982178, |
| "learning_rate": 8.235802337421919e-05, |
| "loss": 1.915595531463623, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 6.79098, |
| "step": 277, |
| "tokens/total": 567296, |
| "tokens/train_per_sec_per_gpu": 139.56, |
| "tokens/trainable": 560720 |
| }, |
| { |
| "epoch": 3.131048387096774, |
| "grad_norm": 1.246692180633545, |
| "learning_rate": 8.173493044232745e-05, |
| "loss": 1.5288515090942383, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 4.61288, |
| "step": 278, |
| "tokens/total": 569344, |
| "tokens/train_per_sec_per_gpu": 143.15, |
| "tokens/trainable": 562768 |
| }, |
| { |
| "epoch": 3.133064516129032, |
| "grad_norm": 1.1976420879364014, |
| "learning_rate": 8.11125702611846e-05, |
| "loss": 1.3603302240371704, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 3.89748, |
| "step": 279, |
| "tokens/total": 571392, |
| "tokens/train_per_sec_per_gpu": 137.69, |
| "tokens/trainable": 564802 |
| }, |
| { |
| "epoch": 3.1350806451612905, |
| "grad_norm": 1.2767409086227417, |
| "learning_rate": 8.049096779838719e-05, |
| "loss": 1.5446770191192627, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 4.68646, |
| "step": 280, |
| "tokens/total": 573440, |
| "tokens/train_per_sec_per_gpu": 136.27, |
| "tokens/trainable": 566813 |
| }, |
| { |
| "epoch": 3.1370967741935485, |
| "grad_norm": 1.2133939266204834, |
| "learning_rate": 7.987014799113397e-05, |
| "loss": 1.2143042087554932, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 3.36795, |
| "step": 281, |
| "tokens/total": 575488, |
| "tokens/train_per_sec_per_gpu": 134.72, |
| "tokens/trainable": 568792 |
| }, |
| { |
| "epoch": 3.1391129032258065, |
| "grad_norm": 1.0450326204299927, |
| "learning_rate": 7.925013574522557e-05, |
| "loss": 1.2417564392089844, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 3.46169, |
| "step": 282, |
| "tokens/total": 577536, |
| "tokens/train_per_sec_per_gpu": 137.68, |
| "tokens/trainable": 570791 |
| }, |
| { |
| "epoch": 3.1411290322580645, |
| "grad_norm": 1.2996821403503418, |
| "learning_rate": 7.863095593406491e-05, |
| "loss": 1.5991716384887695, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 4.94893, |
| "step": 283, |
| "tokens/total": 579584, |
| "tokens/train_per_sec_per_gpu": 134.29, |
| "tokens/trainable": 572699 |
| }, |
| { |
| "epoch": 3.1431451612903225, |
| "grad_norm": 1.655535101890564, |
| "learning_rate": 7.801263339765994e-05, |
| "loss": 1.2429203987121582, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 3.46572, |
| "step": 284, |
| "tokens/total": 581632, |
| "tokens/train_per_sec_per_gpu": 104.39, |
| "tokens/trainable": 574221 |
| }, |
| { |
| "epoch": 4.002016129032258, |
| "grad_norm": 1.274104356765747, |
| "learning_rate": 7.739519294162652e-05, |
| "loss": 1.5383883714675903, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 4.65708, |
| "step": 285, |
| "tokens/total": 583680, |
| "tokens/train_per_sec_per_gpu": 139.07, |
| "tokens/trainable": 576269 |
| }, |
| { |
| "epoch": 4.004032258064516, |
| "grad_norm": 1.2912395000457764, |
| "learning_rate": 7.677865933619379e-05, |
| "loss": 1.3963959217071533, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 4.04061, |
| "step": 286, |
| "tokens/total": 585728, |
| "tokens/train_per_sec_per_gpu": 139.66, |
| "tokens/trainable": 578298 |
| }, |
| { |
| "epoch": 4.006048387096774, |
| "grad_norm": 1.2305668592453003, |
| "learning_rate": 7.616305731521008e-05, |
| "loss": 1.466391921043396, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 4.33357, |
| "step": 287, |
| "tokens/total": 587776, |
| "tokens/train_per_sec_per_gpu": 232.49, |
| "tokens/trainable": 580337 |
| }, |
| { |
| "epoch": 4.008064516129032, |
| "grad_norm": 1.1913374662399292, |
| "learning_rate": 7.554841157515092e-05, |
| "loss": 1.312002420425415, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 3.7136, |
| "step": 288, |
| "tokens/total": 589824, |
| "tokens/train_per_sec_per_gpu": 263.64, |
| "tokens/trainable": 582382 |
| }, |
| { |
| "epoch": 4.01008064516129, |
| "grad_norm": 1.3715969324111938, |
| "learning_rate": 7.493474677412794e-05, |
| "loss": 1.5232388973236084, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 4.58706, |
| "step": 289, |
| "tokens/total": 591872, |
| "tokens/train_per_sec_per_gpu": 265.19, |
| "tokens/trainable": 584411 |
| }, |
| { |
| "epoch": 4.012096774193548, |
| "grad_norm": 1.2743836641311646, |
| "learning_rate": 7.432208753090009e-05, |
| "loss": 1.570101022720337, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 4.80713, |
| "step": 290, |
| "tokens/total": 593920, |
| "tokens/train_per_sec_per_gpu": 267.17, |
| "tokens/trainable": 586451 |
| }, |
| { |
| "epoch": 4.014112903225806, |
| "grad_norm": 1.1839017868041992, |
| "learning_rate": 7.371045842388552e-05, |
| "loss": 1.2354711294174194, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 3.44, |
| "step": 291, |
| "tokens/total": 595968, |
| "tokens/train_per_sec_per_gpu": 257.88, |
| "tokens/trainable": 588494 |
| }, |
| { |
| "epoch": 4.016129032258065, |
| "grad_norm": 1.5452686548233032, |
| "learning_rate": 7.309988399017602e-05, |
| "loss": 1.75980806350708, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 5.81132, |
| "step": 292, |
| "tokens/total": 598016, |
| "tokens/train_per_sec_per_gpu": 266.87, |
| "tokens/trainable": 590539 |
| }, |
| { |
| "epoch": 4.018145161290323, |
| "grad_norm": 1.3476964235305786, |
| "learning_rate": 7.24903887245522e-05, |
| "loss": 1.204418659210205, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 3.33482, |
| "step": 293, |
| "tokens/total": 600064, |
| "tokens/train_per_sec_per_gpu": 266.31, |
| "tokens/trainable": 592573 |
| }, |
| { |
| "epoch": 4.020161290322581, |
| "grad_norm": 1.3166759014129639, |
| "learning_rate": 7.188199707850122e-05, |
| "loss": 1.5485095977783203, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 4.70445, |
| "step": 294, |
| "tokens/total": 602112, |
| "tokens/train_per_sec_per_gpu": 262.47, |
| "tokens/trainable": 594600 |
| }, |
| { |
| "epoch": 4.022177419354839, |
| "grad_norm": 1.2978073358535767, |
| "learning_rate": 7.127473345923554e-05, |
| "loss": 1.2120771408081055, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 3.36046, |
| "step": 295, |
| "tokens/total": 604160, |
| "tokens/train_per_sec_per_gpu": 264.34, |
| "tokens/trainable": 596636 |
| }, |
| { |
| "epoch": 4.024193548387097, |
| "grad_norm": 1.4947535991668701, |
| "learning_rate": 7.066862222871397e-05, |
| "loss": 1.7942547798156738, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 6.01499, |
| "step": 296, |
| "tokens/total": 606208, |
| "tokens/train_per_sec_per_gpu": 262.53, |
| "tokens/trainable": 598664 |
| }, |
| { |
| "epoch": 4.026209677419355, |
| "grad_norm": 1.3672757148742676, |
| "learning_rate": 7.006368770266421e-05, |
| "loss": 1.1543655395507812, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 3.17201, |
| "step": 297, |
| "tokens/total": 608256, |
| "tokens/train_per_sec_per_gpu": 263.64, |
| "tokens/trainable": 600701 |
| }, |
| { |
| "epoch": 4.028225806451613, |
| "grad_norm": 1.3657294511795044, |
| "learning_rate": 6.945995414960744e-05, |
| "loss": 1.5515549182891846, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 4.7188, |
| "step": 298, |
| "tokens/total": 610304, |
| "tokens/train_per_sec_per_gpu": 266.24, |
| "tokens/trainable": 602742 |
| }, |
| { |
| "epoch": 4.030241935483871, |
| "grad_norm": 1.343180775642395, |
| "learning_rate": 6.885744578988463e-05, |
| "loss": 1.6137218475341797, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 5.02147, |
| "step": 299, |
| "tokens/total": 612352, |
| "tokens/train_per_sec_per_gpu": 262.83, |
| "tokens/trainable": 604776 |
| }, |
| { |
| "epoch": 4.032258064516129, |
| "grad_norm": 1.1807539463043213, |
| "learning_rate": 6.825618679468502e-05, |
| "loss": 0.9370715618133545, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 2.5525, |
| "step": 300, |
| "tokens/total": 614400, |
| "tokens/train_per_sec_per_gpu": 263.56, |
| "tokens/trainable": 606812 |
| }, |
| { |
| "epoch": 4.034274193548387, |
| "grad_norm": 1.3539948463439941, |
| "learning_rate": 6.765620128507619e-05, |
| "loss": 1.345663070678711, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 3.84073, |
| "step": 301, |
| "tokens/total": 616448, |
| "tokens/train_per_sec_per_gpu": 261.91, |
| "tokens/trainable": 608850 |
| }, |
| { |
| "epoch": 4.036290322580645, |
| "grad_norm": 1.2722002267837524, |
| "learning_rate": 6.705751333103675e-05, |
| "loss": 1.2931978702545166, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 3.64442, |
| "step": 302, |
| "tokens/total": 618496, |
| "tokens/train_per_sec_per_gpu": 268.26, |
| "tokens/trainable": 610889 |
| }, |
| { |
| "epoch": 4.038306451612903, |
| "grad_norm": 1.3646944761276245, |
| "learning_rate": 6.64601469504903e-05, |
| "loss": 1.4423154592514038, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 4.23048, |
| "step": 303, |
| "tokens/total": 620544, |
| "tokens/train_per_sec_per_gpu": 268.66, |
| "tokens/trainable": 612927 |
| }, |
| { |
| "epoch": 4.040322580645161, |
| "grad_norm": 1.234769344329834, |
| "learning_rate": 6.586412610834221e-05, |
| "loss": 1.2587189674377441, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 3.52091, |
| "step": 304, |
| "tokens/total": 622592, |
| "tokens/train_per_sec_per_gpu": 268.44, |
| "tokens/trainable": 614966 |
| }, |
| { |
| "epoch": 4.042338709677419, |
| "grad_norm": 1.3554316759109497, |
| "learning_rate": 6.526947471551798e-05, |
| "loss": 1.6908648014068604, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 5.42417, |
| "step": 305, |
| "tokens/total": 624640, |
| "tokens/train_per_sec_per_gpu": 268.41, |
| "tokens/trainable": 617010 |
| }, |
| { |
| "epoch": 4.044354838709677, |
| "grad_norm": 1.4698665142059326, |
| "learning_rate": 6.46762166280042e-05, |
| "loss": 1.8330029249191284, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 6.25263, |
| "step": 306, |
| "tokens/total": 626688, |
| "tokens/train_per_sec_per_gpu": 265.37, |
| "tokens/trainable": 619042 |
| }, |
| { |
| "epoch": 4.046370967741935, |
| "grad_norm": 1.3987410068511963, |
| "learning_rate": 6.40843756458913e-05, |
| "loss": 1.682692527770996, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 5.38002, |
| "step": 307, |
| "tokens/total": 628736, |
| "tokens/train_per_sec_per_gpu": 263.08, |
| "tokens/trainable": 621066 |
| }, |
| { |
| "epoch": 4.048387096774194, |
| "grad_norm": 1.5588736534118652, |
| "learning_rate": 6.349397551241894e-05, |
| "loss": 1.6790101528167725, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 5.36025, |
| "step": 308, |
| "tokens/total": 630784, |
| "tokens/train_per_sec_per_gpu": 265.49, |
| "tokens/trainable": 623102 |
| }, |
| { |
| "epoch": 4.050403225806452, |
| "grad_norm": 1.292420744895935, |
| "learning_rate": 6.290503991302324e-05, |
| "loss": 1.1997315883636475, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 3.31923, |
| "step": 309, |
| "tokens/total": 632832, |
| "tokens/train_per_sec_per_gpu": 263.41, |
| "tokens/trainable": 625144 |
| }, |
| { |
| "epoch": 4.05241935483871, |
| "grad_norm": 1.4492568969726562, |
| "learning_rate": 6.231759247438689e-05, |
| "loss": 1.3761956691741943, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 3.95981, |
| "step": 310, |
| "tokens/total": 634880, |
| "tokens/train_per_sec_per_gpu": 262.05, |
| "tokens/trainable": 627175 |
| }, |
| { |
| "epoch": 4.054435483870968, |
| "grad_norm": 1.437646508216858, |
| "learning_rate": 6.173165676349103e-05, |
| "loss": 1.2556071281433105, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 3.50997, |
| "step": 311, |
| "tokens/total": 636928, |
| "tokens/train_per_sec_per_gpu": 262.88, |
| "tokens/trainable": 629214 |
| }, |
| { |
| "epoch": 4.056451612903226, |
| "grad_norm": 1.372708797454834, |
| "learning_rate": 6.114725628666998e-05, |
| "loss": 1.3328940868377686, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 3.792, |
| "step": 312, |
| "tokens/total": 638976, |
| "tokens/train_per_sec_per_gpu": 264.5, |
| "tokens/trainable": 631258 |
| }, |
| { |
| "epoch": 4.058467741935484, |
| "grad_norm": 1.6981028318405151, |
| "learning_rate": 6.0564414488668165e-05, |
| "loss": 1.6544911861419678, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 5.23042, |
| "step": 313, |
| "tokens/total": 641024, |
| "tokens/train_per_sec_per_gpu": 262.31, |
| "tokens/trainable": 633287 |
| }, |
| { |
| "epoch": 4.060483870967742, |
| "grad_norm": 1.4426374435424805, |
| "learning_rate": 5.998315475169942e-05, |
| "loss": 1.2637913227081299, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 3.53881, |
| "step": 314, |
| "tokens/total": 643072, |
| "tokens/train_per_sec_per_gpu": 266.31, |
| "tokens/trainable": 635323 |
| }, |
| { |
| "epoch": 4.0625, |
| "grad_norm": 1.3725757598876953, |
| "learning_rate": 5.94035003945093e-05, |
| "loss": 1.5147466659545898, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 4.54827, |
| "step": 315, |
| "tokens/total": 645120, |
| "tokens/train_per_sec_per_gpu": 265.15, |
| "tokens/trainable": 637355 |
| }, |
| { |
| "epoch": 4.064516129032258, |
| "grad_norm": 1.3077338933944702, |
| "learning_rate": 5.88254746714392e-05, |
| "loss": 1.2489416599273682, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 3.48665, |
| "step": 316, |
| "tokens/total": 647168, |
| "tokens/train_per_sec_per_gpu": 254.63, |
| "tokens/trainable": 639396 |
| }, |
| { |
| "epoch": 4.066532258064516, |
| "grad_norm": 1.3260565996170044, |
| "learning_rate": 5.824910077149371e-05, |
| "loss": 1.153064489364624, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 3.16789, |
| "step": 317, |
| "tokens/total": 649216, |
| "tokens/train_per_sec_per_gpu": 265.77, |
| "tokens/trainable": 641426 |
| }, |
| { |
| "epoch": 4.068548387096774, |
| "grad_norm": 1.4490100145339966, |
| "learning_rate": 5.767440181741019e-05, |
| "loss": 1.2668461799621582, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 3.54964, |
| "step": 318, |
| "tokens/total": 651264, |
| "tokens/train_per_sec_per_gpu": 262.28, |
| "tokens/trainable": 643458 |
| }, |
| { |
| "epoch": 4.070564516129032, |
| "grad_norm": 1.4084233045578003, |
| "learning_rate": 5.710140086473129e-05, |
| "loss": 1.0618423223495483, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 2.89169, |
| "step": 319, |
| "tokens/total": 653312, |
| "tokens/train_per_sec_per_gpu": 262.96, |
| "tokens/trainable": 645481 |
| }, |
| { |
| "epoch": 4.07258064516129, |
| "grad_norm": 1.4210598468780518, |
| "learning_rate": 5.653012090087977e-05, |
| "loss": 1.278883457183838, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 3.59263, |
| "step": 320, |
| "tokens/total": 655360, |
| "tokens/train_per_sec_per_gpu": 264.32, |
| "tokens/trainable": 647514 |
| }, |
| { |
| "epoch": 4.074596774193548, |
| "grad_norm": 1.4964855909347534, |
| "learning_rate": 5.596058484423656e-05, |
| "loss": 1.4187018871307373, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 4.13175, |
| "step": 321, |
| "tokens/total": 657408, |
| "tokens/train_per_sec_per_gpu": 264.89, |
| "tokens/trainable": 649562 |
| }, |
| { |
| "epoch": 4.076612903225806, |
| "grad_norm": 1.5338218212127686, |
| "learning_rate": 5.5392815543221254e-05, |
| "loss": 1.6856354475021362, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 5.39588, |
| "step": 322, |
| "tokens/total": 659456, |
| "tokens/train_per_sec_per_gpu": 264.2, |
| "tokens/trainable": 651599 |
| }, |
| { |
| "epoch": 4.078629032258065, |
| "grad_norm": 1.441730260848999, |
| "learning_rate": 5.4826835775375285e-05, |
| "loss": 1.5783681869506836, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 4.84704, |
| "step": 323, |
| "tokens/total": 661504, |
| "tokens/train_per_sec_per_gpu": 265.01, |
| "tokens/trainable": 653646 |
| }, |
| { |
| "epoch": 4.080645161290323, |
| "grad_norm": 1.5422190427780151, |
| "learning_rate": 5.4262668246448475e-05, |
| "loss": 1.6311604976654053, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 5.1098, |
| "step": 324, |
| "tokens/total": 663552, |
| "tokens/train_per_sec_per_gpu": 262.57, |
| "tokens/trainable": 655681 |
| }, |
| { |
| "epoch": 4.082661290322581, |
| "grad_norm": 1.6251616477966309, |
| "learning_rate": 5.3700335589487925e-05, |
| "loss": 1.576082468032837, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 4.83597, |
| "step": 325, |
| "tokens/total": 665600, |
| "tokens/train_per_sec_per_gpu": 262.3, |
| "tokens/trainable": 657716 |
| }, |
| { |
| "epoch": 4.084677419354839, |
| "grad_norm": 1.5639586448669434, |
| "learning_rate": 5.3139860363929996e-05, |
| "loss": 1.213226556777954, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 3.36432, |
| "step": 326, |
| "tokens/total": 667648, |
| "tokens/train_per_sec_per_gpu": 260.53, |
| "tokens/trainable": 659728 |
| }, |
| { |
| "epoch": 4.086693548387097, |
| "grad_norm": 1.2871079444885254, |
| "learning_rate": 5.2581265054695494e-05, |
| "loss": 1.0847278833389282, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 2.95863, |
| "step": 327, |
| "tokens/total": 669696, |
| "tokens/train_per_sec_per_gpu": 257.33, |
| "tokens/trainable": 661749 |
| }, |
| { |
| "epoch": 4.088709677419355, |
| "grad_norm": 1.398889183998108, |
| "learning_rate": 5.202457207128736e-05, |
| "loss": 1.311136245727539, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 3.71039, |
| "step": 328, |
| "tokens/total": 671744, |
| "tokens/train_per_sec_per_gpu": 267.91, |
| "tokens/trainable": 663790 |
| }, |
| { |
| "epoch": 4.090725806451613, |
| "grad_norm": 1.2428795099258423, |
| "learning_rate": 5.146980374689192e-05, |
| "loss": 1.1416620016098022, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 3.13197, |
| "step": 329, |
| "tokens/total": 673792, |
| "tokens/train_per_sec_per_gpu": 266.28, |
| "tokens/trainable": 665811 |
| }, |
| { |
| "epoch": 4.092741935483871, |
| "grad_norm": 1.2925283908843994, |
| "learning_rate": 5.0916982337482644e-05, |
| "loss": 1.1519179344177246, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 3.16426, |
| "step": 330, |
| "tokens/total": 675840, |
| "tokens/train_per_sec_per_gpu": 263.97, |
| "tokens/trainable": 667848 |
| }, |
| { |
| "epoch": 4.094758064516129, |
| "grad_norm": 1.506675362586975, |
| "learning_rate": 5.0366130020927624e-05, |
| "loss": 1.4302377700805664, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 4.17969, |
| "step": 331, |
| "tokens/total": 677888, |
| "tokens/train_per_sec_per_gpu": 266.77, |
| "tokens/trainable": 669886 |
| }, |
| { |
| "epoch": 4.096774193548387, |
| "grad_norm": 1.311647891998291, |
| "learning_rate": 4.981726889609952e-05, |
| "loss": 1.3400213718414307, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 3.81913, |
| "step": 332, |
| "tokens/total": 679936, |
| "tokens/train_per_sec_per_gpu": 266.18, |
| "tokens/trainable": 671928 |
| }, |
| { |
| "epoch": 4.098790322580645, |
| "grad_norm": 1.3376080989837646, |
| "learning_rate": 4.9270420981989294e-05, |
| "loss": 1.423518180847168, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 4.1517, |
| "step": 333, |
| "tokens/total": 681984, |
| "tokens/train_per_sec_per_gpu": 264.49, |
| "tokens/trainable": 673948 |
| }, |
| { |
| "epoch": 4.100806451612903, |
| "grad_norm": 1.3112465143203735, |
| "learning_rate": 4.872560821682256e-05, |
| "loss": 1.1089239120483398, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 3.03109, |
| "step": 334, |
| "tokens/total": 684032, |
| "tokens/train_per_sec_per_gpu": 260.74, |
| "tokens/trainable": 675973 |
| }, |
| { |
| "epoch": 4.102822580645161, |
| "grad_norm": 1.4535356760025024, |
| "learning_rate": 4.818285245717984e-05, |
| "loss": 1.2431288957595825, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 3.46644, |
| "step": 335, |
| "tokens/total": 686080, |
| "tokens/train_per_sec_per_gpu": 253.64, |
| "tokens/trainable": 678009 |
| }, |
| { |
| "epoch": 4.104838709677419, |
| "grad_norm": 1.4877009391784668, |
| "learning_rate": 4.764217547711934e-05, |
| "loss": 1.4921081066131592, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 4.44646, |
| "step": 336, |
| "tokens/total": 688128, |
| "tokens/train_per_sec_per_gpu": 263.79, |
| "tokens/trainable": 680047 |
| }, |
| { |
| "epoch": 4.106854838709677, |
| "grad_norm": 1.486241340637207, |
| "learning_rate": 4.710359896730379e-05, |
| "loss": 1.3672105073928833, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 3.92439, |
| "step": 337, |
| "tokens/total": 690176, |
| "tokens/train_per_sec_per_gpu": 264.3, |
| "tokens/trainable": 682082 |
| }, |
| { |
| "epoch": 4.108870967741935, |
| "grad_norm": 1.5436185598373413, |
| "learning_rate": 4.656714453412993e-05, |
| "loss": 1.280173659324646, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 3.59726, |
| "step": 338, |
| "tokens/total": 692224, |
| "tokens/train_per_sec_per_gpu": 263.44, |
| "tokens/trainable": 684104 |
| }, |
| { |
| "epoch": 4.110887096774194, |
| "grad_norm": 1.5693432092666626, |
| "learning_rate": 4.6032833698862044e-05, |
| "loss": 1.5435786247253418, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 4.68131, |
| "step": 339, |
| "tokens/total": 694272, |
| "tokens/train_per_sec_per_gpu": 266.3, |
| "tokens/trainable": 686136 |
| }, |
| { |
| "epoch": 4.112903225806452, |
| "grad_norm": 1.3754907846450806, |
| "learning_rate": 4.5500687896768256e-05, |
| "loss": 1.0676181316375732, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 2.90844, |
| "step": 340, |
| "tokens/total": 696320, |
| "tokens/train_per_sec_per_gpu": 269.49, |
| "tokens/trainable": 688177 |
| }, |
| { |
| "epoch": 4.11491935483871, |
| "grad_norm": 1.6923837661743164, |
| "learning_rate": 4.497072847626087e-05, |
| "loss": 1.1597163677215576, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 3.18903, |
| "step": 341, |
| "tokens/total": 698368, |
| "tokens/train_per_sec_per_gpu": 264.76, |
| "tokens/trainable": 690191 |
| }, |
| { |
| "epoch": 4.116935483870968, |
| "grad_norm": 1.5109210014343262, |
| "learning_rate": 4.444297669803981e-05, |
| "loss": 1.552527904510498, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 4.7234, |
| "step": 342, |
| "tokens/total": 700416, |
| "tokens/train_per_sec_per_gpu": 259.52, |
| "tokens/trainable": 692221 |
| }, |
| { |
| "epoch": 4.118951612903226, |
| "grad_norm": 1.6301664113998413, |
| "learning_rate": 4.3917453734239566e-05, |
| "loss": 1.6055819988250732, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 4.98076, |
| "step": 343, |
| "tokens/total": 702464, |
| "tokens/train_per_sec_per_gpu": 253.58, |
| "tokens/trainable": 694233 |
| }, |
| { |
| "epoch": 4.120967741935484, |
| "grad_norm": 1.4239203929901123, |
| "learning_rate": 4.339418066758008e-05, |
| "loss": 1.364980936050415, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 3.91565, |
| "step": 344, |
| "tokens/total": 704512, |
| "tokens/train_per_sec_per_gpu": 258.35, |
| "tokens/trainable": 696278 |
| }, |
| { |
| "epoch": 4.122983870967742, |
| "grad_norm": 1.6190986633300781, |
| "learning_rate": 4.287317849052075e-05, |
| "loss": 1.7706489562988281, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 5.87466, |
| "step": 345, |
| "tokens/total": 706560, |
| "tokens/train_per_sec_per_gpu": 261.65, |
| "tokens/trainable": 698318 |
| }, |
| { |
| "epoch": 4.125, |
| "grad_norm": 1.4046826362609863, |
| "learning_rate": 4.235446810441841e-05, |
| "loss": 1.204803705215454, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 3.3361, |
| "step": 346, |
| "tokens/total": 708608, |
| "tokens/train_per_sec_per_gpu": 258.84, |
| "tokens/trainable": 700346 |
| }, |
| { |
| "epoch": 4.127016129032258, |
| "grad_norm": 1.7797582149505615, |
| "learning_rate": 4.1838070318688604e-05, |
| "loss": 2.0054688453674316, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 7.42958, |
| "step": 347, |
| "tokens/total": 710656, |
| "tokens/train_per_sec_per_gpu": 258.83, |
| "tokens/trainable": 702366 |
| }, |
| { |
| "epoch": 4.129032258064516, |
| "grad_norm": 1.3385303020477295, |
| "learning_rate": 4.132400584997106e-05, |
| "loss": 1.164678931236267, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 3.20489, |
| "step": 348, |
| "tokens/total": 712704, |
| "tokens/train_per_sec_per_gpu": 255.68, |
| "tokens/trainable": 704367 |
| }, |
| { |
| "epoch": 4.131048387096774, |
| "grad_norm": 1.2760944366455078, |
| "learning_rate": 4.081229532129827e-05, |
| "loss": 1.1424689292907715, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 3.1345, |
| "step": 349, |
| "tokens/total": 714752, |
| "tokens/train_per_sec_per_gpu": 257.9, |
| "tokens/trainable": 706393 |
| }, |
| { |
| "epoch": 4.133064516129032, |
| "grad_norm": 1.4096364974975586, |
| "learning_rate": 4.030295926126845e-05, |
| "loss": 1.409515380859375, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 4.09397, |
| "step": 350, |
| "tokens/total": 716800, |
| "tokens/train_per_sec_per_gpu": 257.11, |
| "tokens/trainable": 708399 |
| }, |
| { |
| "epoch": 4.13508064516129, |
| "grad_norm": 1.3151328563690186, |
| "learning_rate": 3.979601810322169e-05, |
| "loss": 1.3166403770446777, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 3.73087, |
| "step": 351, |
| "tokens/total": 718848, |
| "tokens/train_per_sec_per_gpu": 262.63, |
| "tokens/trainable": 710426 |
| }, |
| { |
| "epoch": 4.137096774193548, |
| "grad_norm": 1.3529253005981445, |
| "learning_rate": 3.929149218442052e-05, |
| "loss": 1.3559751510620117, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 3.88054, |
| "step": 352, |
| "tokens/total": 720896, |
| "tokens/train_per_sec_per_gpu": 258.26, |
| "tokens/trainable": 712423 |
| }, |
| { |
| "epoch": 4.139112903225806, |
| "grad_norm": 1.3632549047470093, |
| "learning_rate": 3.878940174523371e-05, |
| "loss": 1.2670247554779053, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 3.55027, |
| "step": 353, |
| "tokens/total": 722944, |
| "tokens/train_per_sec_per_gpu": 261.46, |
| "tokens/trainable": 714448 |
| }, |
| { |
| "epoch": 4.141129032258065, |
| "grad_norm": 1.3360320329666138, |
| "learning_rate": 3.828976692832458e-05, |
| "loss": 1.553146481513977, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 4.72632, |
| "step": 354, |
| "tokens/total": 724992, |
| "tokens/train_per_sec_per_gpu": 254.34, |
| "tokens/trainable": 716436 |
| }, |
| { |
| "epoch": 4.143145161290323, |
| "grad_norm": 1.5650333166122437, |
| "learning_rate": 3.779260777784263e-05, |
| "loss": 1.3416231870651245, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 3.82525, |
| "step": 355, |
| "tokens/total": 727040, |
| "tokens/train_per_sec_per_gpu": 205.74, |
| "tokens/trainable": 718064 |
| }, |
| { |
| "epoch": 5.002016129032258, |
| "grad_norm": 1.5316541194915771, |
| "learning_rate": 3.7297944238619706e-05, |
| "loss": 1.6510117053985596, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 5.21225, |
| "step": 356, |
| "tokens/total": 729088, |
| "tokens/train_per_sec_per_gpu": 257.01, |
| "tokens/trainable": 720105 |
| }, |
| { |
| "epoch": 5.004032258064516, |
| "grad_norm": 1.2902381420135498, |
| "learning_rate": 3.680579615536961e-05, |
| "loss": 1.4203373193740845, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 4.13852, |
| "step": 357, |
| "tokens/total": 731136, |
| "tokens/train_per_sec_per_gpu": 247.52, |
| "tokens/trainable": 722129 |
| }, |
| { |
| "epoch": 5.006048387096774, |
| "grad_norm": 1.3662035465240479, |
| "learning_rate": 3.631618327189218e-05, |
| "loss": 1.4179635047912598, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 4.1287, |
| "step": 358, |
| "tokens/total": 733184, |
| "tokens/train_per_sec_per_gpu": 258.81, |
| "tokens/trainable": 724173 |
| }, |
| { |
| "epoch": 5.008064516129032, |
| "grad_norm": 1.3037316799163818, |
| "learning_rate": 3.582912523028101e-05, |
| "loss": 1.352861762046814, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 3.86848, |
| "step": 359, |
| "tokens/total": 735232, |
| "tokens/train_per_sec_per_gpu": 262.86, |
| "tokens/trainable": 726202 |
| }, |
| { |
| "epoch": 5.01008064516129, |
| "grad_norm": 1.3802729845046997, |
| "learning_rate": 3.534464157013574e-05, |
| "loss": 1.6564579010009766, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 5.24071, |
| "step": 360, |
| "tokens/total": 737280, |
| "tokens/train_per_sec_per_gpu": 266.73, |
| "tokens/trainable": 728239 |
| }, |
| { |
| "epoch": 5.012096774193548, |
| "grad_norm": 1.6267991065979004, |
| "learning_rate": 3.4862751727777797e-05, |
| "loss": 1.5538551807403564, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 4.72967, |
| "step": 361, |
| "tokens/total": 739328, |
| "tokens/train_per_sec_per_gpu": 240.84, |
| "tokens/trainable": 730281 |
| }, |
| { |
| "epoch": 5.014112903225806, |
| "grad_norm": 1.455069661140442, |
| "learning_rate": 3.438347503547102e-05, |
| "loss": 1.299963355064392, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 3.66916, |
| "step": 362, |
| "tokens/total": 741376, |
| "tokens/train_per_sec_per_gpu": 264.41, |
| "tokens/trainable": 732325 |
| }, |
| { |
| "epoch": 5.016129032258065, |
| "grad_norm": 1.1740471124649048, |
| "learning_rate": 3.390683072064594e-05, |
| "loss": 0.7859928607940674, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 2.19458, |
| "step": 363, |
| "tokens/total": 743424, |
| "tokens/train_per_sec_per_gpu": 261.92, |
| "tokens/trainable": 734363 |
| }, |
| { |
| "epoch": 5.018145161290323, |
| "grad_norm": 1.5127320289611816, |
| "learning_rate": 3.343283790512829e-05, |
| "loss": 1.3409039974212646, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 3.8225, |
| "step": 364, |
| "tokens/total": 745472, |
| "tokens/train_per_sec_per_gpu": 264.56, |
| "tokens/trainable": 736408 |
| }, |
| { |
| "epoch": 5.020161290322581, |
| "grad_norm": 1.4884353876113892, |
| "learning_rate": 3.296151560437214e-05, |
| "loss": 1.1510839462280273, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 3.16162, |
| "step": 365, |
| "tokens/total": 747520, |
| "tokens/train_per_sec_per_gpu": 261.75, |
| "tokens/trainable": 738441 |
| }, |
| { |
| "epoch": 5.022177419354839, |
| "grad_norm": 1.6059309244155884, |
| "learning_rate": 3.249288272669691e-05, |
| "loss": 1.377195119857788, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 3.96377, |
| "step": 366, |
| "tokens/total": 749568, |
| "tokens/train_per_sec_per_gpu": 263.14, |
| "tokens/trainable": 740472 |
| }, |
| { |
| "epoch": 5.024193548387097, |
| "grad_norm": 1.5393133163452148, |
| "learning_rate": 3.202695807252871e-05, |
| "loss": 1.3185768127441406, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 3.7381, |
| "step": 367, |
| "tokens/total": 751616, |
| "tokens/train_per_sec_per_gpu": 263.8, |
| "tokens/trainable": 742500 |
| }, |
| { |
| "epoch": 5.026209677419355, |
| "grad_norm": 1.5796706676483154, |
| "learning_rate": 3.1563760333646395e-05, |
| "loss": 1.3427810668945312, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 3.82968, |
| "step": 368, |
| "tokens/total": 753664, |
| "tokens/train_per_sec_per_gpu": 261.52, |
| "tokens/trainable": 744527 |
| }, |
| { |
| "epoch": 5.028225806451613, |
| "grad_norm": 1.6402651071548462, |
| "learning_rate": 3.110330809243134e-05, |
| "loss": 1.2144150733947754, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 3.36832, |
| "step": 369, |
| "tokens/total": 755712, |
| "tokens/train_per_sec_per_gpu": 261.29, |
| "tokens/trainable": 746548 |
| }, |
| { |
| "epoch": 5.030241935483871, |
| "grad_norm": 1.6114401817321777, |
| "learning_rate": 3.064561982112232e-05, |
| "loss": 1.4515684843063354, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 4.26981, |
| "step": 370, |
| "tokens/total": 757760, |
| "tokens/train_per_sec_per_gpu": 266.07, |
| "tokens/trainable": 748593 |
| }, |
| { |
| "epoch": 5.032258064516129, |
| "grad_norm": 1.5494929552078247, |
| "learning_rate": 3.0190713881074105e-05, |
| "loss": 1.3978958129882812, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 4.04668, |
| "step": 371, |
| "tokens/total": 759808, |
| "tokens/train_per_sec_per_gpu": 262.06, |
| "tokens/trainable": 750622 |
| }, |
| { |
| "epoch": 5.034274193548387, |
| "grad_norm": 1.4975247383117676, |
| "learning_rate": 2.9738608522021173e-05, |
| "loss": 1.3401731252670288, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 3.8197, |
| "step": 372, |
| "tokens/total": 761856, |
| "tokens/train_per_sec_per_gpu": 266.56, |
| "tokens/trainable": 752664 |
| }, |
| { |
| "epoch": 5.036290322580645, |
| "grad_norm": 1.4258899688720703, |
| "learning_rate": 2.9289321881345254e-05, |
| "loss": 1.0087716579437256, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 2.74223, |
| "step": 373, |
| "tokens/total": 763904, |
| "tokens/train_per_sec_per_gpu": 262.62, |
| "tokens/trainable": 754695 |
| }, |
| { |
| "epoch": 5.038306451612903, |
| "grad_norm": 1.3442202806472778, |
| "learning_rate": 2.8842871983347998e-05, |
| "loss": 1.0394554138183594, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 2.82768, |
| "step": 374, |
| "tokens/total": 765952, |
| "tokens/train_per_sec_per_gpu": 265.19, |
| "tokens/trainable": 756738 |
| }, |
| { |
| "epoch": 5.040322580645161, |
| "grad_norm": 1.64373779296875, |
| "learning_rate": 2.8399276738527714e-05, |
| "loss": 1.4822442531585693, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 4.40282, |
| "step": 375, |
| "tokens/total": 768000, |
| "tokens/train_per_sec_per_gpu": 263.71, |
| "tokens/trainable": 758772 |
| }, |
| { |
| "epoch": 5.042338709677419, |
| "grad_norm": 1.4457206726074219, |
| "learning_rate": 2.795855394286081e-05, |
| "loss": 1.1947224140167236, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 3.30264, |
| "step": 376, |
| "tokens/total": 770048, |
| "tokens/train_per_sec_per_gpu": 265.63, |
| "tokens/trainable": 760817 |
| }, |
| { |
| "epoch": 5.044354838709677, |
| "grad_norm": 1.353393316268921, |
| "learning_rate": 2.7520721277088024e-05, |
| "loss": 1.000101923942566, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 2.71856, |
| "step": 377, |
| "tokens/total": 772096, |
| "tokens/train_per_sec_per_gpu": 263.99, |
| "tokens/trainable": 762861 |
| }, |
| { |
| "epoch": 5.046370967741935, |
| "grad_norm": 1.3696867227554321, |
| "learning_rate": 2.7085796306004906e-05, |
| "loss": 1.1429383754730225, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 3.13597, |
| "step": 378, |
| "tokens/total": 774144, |
| "tokens/train_per_sec_per_gpu": 264.49, |
| "tokens/trainable": 764893 |
| }, |
| { |
| "epoch": 5.048387096774194, |
| "grad_norm": 1.4743932485580444, |
| "learning_rate": 2.6653796477757432e-05, |
| "loss": 1.1978074312210083, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 3.31285, |
| "step": 379, |
| "tokens/total": 776192, |
| "tokens/train_per_sec_per_gpu": 256.19, |
| "tokens/trainable": 766940 |
| }, |
| { |
| "epoch": 5.050403225806452, |
| "grad_norm": 1.4643374681472778, |
| "learning_rate": 2.6224739123141684e-05, |
| "loss": 1.512551188468933, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 4.53829, |
| "step": 380, |
| "tokens/total": 778240, |
| "tokens/train_per_sec_per_gpu": 268.35, |
| "tokens/trainable": 768987 |
| }, |
| { |
| "epoch": 5.05241935483871, |
| "grad_norm": 1.456166386604309, |
| "learning_rate": 2.5798641454908944e-05, |
| "loss": 1.212234616279602, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 3.36099, |
| "step": 381, |
| "tokens/total": 780288, |
| "tokens/train_per_sec_per_gpu": 268.08, |
| "tokens/trainable": 771017 |
| }, |
| { |
| "epoch": 5.054435483870968, |
| "grad_norm": 1.4265928268432617, |
| "learning_rate": 2.537552056707483e-05, |
| "loss": 1.1896257400512695, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 3.28585, |
| "step": 382, |
| "tokens/total": 782336, |
| "tokens/train_per_sec_per_gpu": 268.81, |
| "tokens/trainable": 773063 |
| }, |
| { |
| "epoch": 5.056451612903226, |
| "grad_norm": 1.5509512424468994, |
| "learning_rate": 2.4955393434233754e-05, |
| "loss": 0.8469423055648804, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 2.3325, |
| "step": 383, |
| "tokens/total": 784384, |
| "tokens/train_per_sec_per_gpu": 260.74, |
| "tokens/trainable": 775084 |
| }, |
| { |
| "epoch": 5.058467741935484, |
| "grad_norm": 1.5296618938446045, |
| "learning_rate": 2.45382769108779e-05, |
| "loss": 1.3008588552474976, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 3.67245, |
| "step": 384, |
| "tokens/total": 786432, |
| "tokens/train_per_sec_per_gpu": 260.72, |
| "tokens/trainable": 777109 |
| }, |
| { |
| "epoch": 5.060483870967742, |
| "grad_norm": 1.67258882522583, |
| "learning_rate": 2.4124187730720917e-05, |
| "loss": 1.3961818218231201, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 4.03975, |
| "step": 385, |
| "tokens/total": 788480, |
| "tokens/train_per_sec_per_gpu": 264.5, |
| "tokens/trainable": 779147 |
| }, |
| { |
| "epoch": 5.0625, |
| "grad_norm": 1.4869979619979858, |
| "learning_rate": 2.3713142506026786e-05, |
| "loss": 1.1131031513214111, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 3.04379, |
| "step": 386, |
| "tokens/total": 790528, |
| "tokens/train_per_sec_per_gpu": 262.53, |
| "tokens/trainable": 781185 |
| }, |
| { |
| "epoch": 5.064516129032258, |
| "grad_norm": 1.2601240873336792, |
| "learning_rate": 2.3305157726943327e-05, |
| "loss": 0.9813694357872009, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 2.66811, |
| "step": 387, |
| "tokens/total": 792576, |
| "tokens/train_per_sec_per_gpu": 263.69, |
| "tokens/trainable": 783227 |
| }, |
| { |
| "epoch": 5.066532258064516, |
| "grad_norm": 1.640095829963684, |
| "learning_rate": 2.290024976084052e-05, |
| "loss": 1.22378671169281, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 3.40004, |
| "step": 388, |
| "tokens/total": 794624, |
| "tokens/train_per_sec_per_gpu": 262.79, |
| "tokens/trainable": 785258 |
| }, |
| { |
| "epoch": 5.068548387096774, |
| "grad_norm": 1.4825721979141235, |
| "learning_rate": 2.2498434851654126e-05, |
| "loss": 1.0400973558425903, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 2.82949, |
| "step": 389, |
| "tokens/total": 796672, |
| "tokens/train_per_sec_per_gpu": 261.9, |
| "tokens/trainable": 787284 |
| }, |
| { |
| "epoch": 5.070564516129032, |
| "grad_norm": 1.561867117881775, |
| "learning_rate": 2.209972911923377e-05, |
| "loss": 0.9926523566246033, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 2.69838, |
| "step": 390, |
| "tokens/total": 798720, |
| "tokens/train_per_sec_per_gpu": 262.8, |
| "tokens/trainable": 789319 |
| }, |
| { |
| "epoch": 5.07258064516129, |
| "grad_norm": 1.9464138746261597, |
| "learning_rate": 2.170414855869647e-05, |
| "loss": 1.26716148853302, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 3.55076, |
| "step": 391, |
| "tokens/total": 800768, |
| "tokens/train_per_sec_per_gpu": 265.31, |
| "tokens/trainable": 791355 |
| }, |
| { |
| "epoch": 5.074596774193548, |
| "grad_norm": 1.5921978950500488, |
| "learning_rate": 2.1311709039784734e-05, |
| "loss": 1.1754467487335205, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 3.23959, |
| "step": 392, |
| "tokens/total": 802816, |
| "tokens/train_per_sec_per_gpu": 258.36, |
| "tokens/trainable": 793389 |
| }, |
| { |
| "epoch": 5.076612903225806, |
| "grad_norm": 1.8206204175949097, |
| "learning_rate": 2.092242630623016e-05, |
| "loss": 1.431692123413086, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 4.18578, |
| "step": 393, |
| "tokens/total": 804864, |
| "tokens/train_per_sec_per_gpu": 266.26, |
| "tokens/trainable": 795424 |
| }, |
| { |
| "epoch": 5.078629032258065, |
| "grad_norm": 1.725522518157959, |
| "learning_rate": 2.0536315975121544e-05, |
| "loss": 1.7849311828613281, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 5.95917, |
| "step": 394, |
| "tokens/total": 806912, |
| "tokens/train_per_sec_per_gpu": 270.11, |
| "tokens/trainable": 797452 |
| }, |
| { |
| "epoch": 5.080645161290323, |
| "grad_norm": 1.5005055665969849, |
| "learning_rate": 2.0153393536278653e-05, |
| "loss": 1.2330188751220703, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 3.43157, |
| "step": 395, |
| "tokens/total": 808960, |
| "tokens/train_per_sec_per_gpu": 267.03, |
| "tokens/trainable": 799477 |
| }, |
| { |
| "epoch": 5.082661290322581, |
| "grad_norm": 1.4876439571380615, |
| "learning_rate": 1.9773674351630545e-05, |
| "loss": 1.4646284580230713, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 4.32594, |
| "step": 396, |
| "tokens/total": 811008, |
| "tokens/train_per_sec_per_gpu": 265.64, |
| "tokens/trainable": 801522 |
| }, |
| { |
| "epoch": 5.084677419354839, |
| "grad_norm": 1.5197019577026367, |
| "learning_rate": 1.939717365459952e-05, |
| "loss": 1.2707014083862305, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 3.56335, |
| "step": 397, |
| "tokens/total": 813056, |
| "tokens/train_per_sec_per_gpu": 265.76, |
| "tokens/trainable": 803563 |
| }, |
| { |
| "epoch": 5.086693548387097, |
| "grad_norm": 1.59368097782135, |
| "learning_rate": 1.9023906549489767e-05, |
| "loss": 1.4219883680343628, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 4.14535, |
| "step": 398, |
| "tokens/total": 815104, |
| "tokens/train_per_sec_per_gpu": 262.05, |
| "tokens/trainable": 805600 |
| }, |
| { |
| "epoch": 5.088709677419355, |
| "grad_norm": 1.5910316705703735, |
| "learning_rate": 1.8653888010881637e-05, |
| "loss": 1.5422284603118896, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 4.675, |
| "step": 399, |
| "tokens/total": 817152, |
| "tokens/train_per_sec_per_gpu": 262.99, |
| "tokens/trainable": 807629 |
| }, |
| { |
| "epoch": 5.090725806451613, |
| "grad_norm": 1.5295209884643555, |
| "learning_rate": 1.82871328830307e-05, |
| "loss": 1.1745970249176025, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 3.23684, |
| "step": 400, |
| "tokens/total": 819200, |
| "tokens/train_per_sec_per_gpu": 256.08, |
| "tokens/trainable": 809674 |
| }, |
| { |
| "epoch": 5.092741935483871, |
| "grad_norm": 1.8055143356323242, |
| "learning_rate": 1.7923655879272393e-05, |
| "loss": 1.7367737293243408, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 5.67899, |
| "step": 401, |
| "tokens/total": 821248, |
| "tokens/train_per_sec_per_gpu": 262.84, |
| "tokens/trainable": 811712 |
| }, |
| { |
| "epoch": 5.094758064516129, |
| "grad_norm": 1.5461682081222534, |
| "learning_rate": 1.7563471581431624e-05, |
| "loss": 1.4324666261672974, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 4.18902, |
| "step": 402, |
| "tokens/total": 823296, |
| "tokens/train_per_sec_per_gpu": 265.47, |
| "tokens/trainable": 813752 |
| }, |
| { |
| "epoch": 5.096774193548387, |
| "grad_norm": 1.5001784563064575, |
| "learning_rate": 1.7206594439237865e-05, |
| "loss": 1.1119897365570068, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 3.0404, |
| "step": 403, |
| "tokens/total": 825344, |
| "tokens/train_per_sec_per_gpu": 262.3, |
| "tokens/trainable": 815783 |
| }, |
| { |
| "epoch": 5.098790322580645, |
| "grad_norm": 1.5779640674591064, |
| "learning_rate": 1.6853038769745467e-05, |
| "loss": 1.464457392692566, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 4.3252, |
| "step": 404, |
| "tokens/total": 827392, |
| "tokens/train_per_sec_per_gpu": 261.96, |
| "tokens/trainable": 817804 |
| }, |
| { |
| "epoch": 5.100806451612903, |
| "grad_norm": 1.684232473373413, |
| "learning_rate": 1.6502818756759276e-05, |
| "loss": 1.3644180297851562, |
| "memory/device_reserved (GiB)": 8.46, |
| "memory/max_active (GiB)": 4.41, |
| "memory/max_allocated (GiB)": 4.34, |
| "ppl": 3.91344, |
| "step": 405, |
| "tokens/total": 829440, |
| "tokens/train_per_sec_per_gpu": 265.87, |
| "tokens/trainable": 819840 |
| } |
| ], |
| "logging_steps": 1, |
| "max_steps": 496, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 9223372036854775807, |
| "save_steps": 45, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 7.298203288338432e+16, |
| "train_batch_size": 1, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|