| { |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 2.98235965988409, |
| "global_step": 70500, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.02, |
| "learning_rate": 4.965241056446268e-05, |
| "loss": 5.7812, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.04, |
| "learning_rate": 4.9299885781970474e-05, |
| "loss": 3.1232, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.06, |
| "learning_rate": 4.894736099947827e-05, |
| "loss": 2.7376, |
| "step": 1500 |
| }, |
| { |
| "epoch": 0.08, |
| "learning_rate": 4.859483621698606e-05, |
| "loss": 2.4571, |
| "step": 2000 |
| }, |
| { |
| "epoch": 0.11, |
| "learning_rate": 4.8242311434493845e-05, |
| "loss": 2.2177, |
| "step": 2500 |
| }, |
| { |
| "epoch": 0.13, |
| "learning_rate": 4.788978665200164e-05, |
| "loss": 1.9238, |
| "step": 3000 |
| }, |
| { |
| "epoch": 0.15, |
| "learning_rate": 4.753726186950943e-05, |
| "loss": 1.7636, |
| "step": 3500 |
| }, |
| { |
| "epoch": 0.17, |
| "learning_rate": 4.718473708701722e-05, |
| "loss": 1.5908, |
| "step": 4000 |
| }, |
| { |
| "epoch": 0.19, |
| "learning_rate": 4.683221230452501e-05, |
| "loss": 1.4306, |
| "step": 4500 |
| }, |
| { |
| "epoch": 0.21, |
| "learning_rate": 4.64796875220328e-05, |
| "loss": 1.2813, |
| "step": 5000 |
| }, |
| { |
| "epoch": 0.23, |
| "learning_rate": 4.6127162739540594e-05, |
| "loss": 1.2258, |
| "step": 5500 |
| }, |
| { |
| "epoch": 0.25, |
| "learning_rate": 4.577463795704838e-05, |
| "loss": 1.1167, |
| "step": 6000 |
| }, |
| { |
| "epoch": 0.27, |
| "learning_rate": 4.542211317455617e-05, |
| "loss": 1.0346, |
| "step": 6500 |
| }, |
| { |
| "epoch": 0.3, |
| "learning_rate": 4.5069588392063965e-05, |
| "loss": 0.963, |
| "step": 7000 |
| }, |
| { |
| "epoch": 0.32, |
| "learning_rate": 4.471706360957176e-05, |
| "loss": 0.9412, |
| "step": 7500 |
| }, |
| { |
| "epoch": 0.34, |
| "learning_rate": 4.4364538827079544e-05, |
| "loss": 0.8301, |
| "step": 8000 |
| }, |
| { |
| "epoch": 0.36, |
| "learning_rate": 4.4012014044587336e-05, |
| "loss": 0.8389, |
| "step": 8500 |
| }, |
| { |
| "epoch": 0.38, |
| "learning_rate": 4.366019431166011e-05, |
| "loss": 0.8128, |
| "step": 9000 |
| }, |
| { |
| "epoch": 0.4, |
| "learning_rate": 4.3307669529167903e-05, |
| "loss": 0.7062, |
| "step": 9500 |
| }, |
| { |
| "epoch": 0.42, |
| "learning_rate": 4.295514474667569e-05, |
| "loss": 0.7305, |
| "step": 10000 |
| }, |
| { |
| "epoch": 0.44, |
| "learning_rate": 4.260261996418348e-05, |
| "loss": 0.7048, |
| "step": 10500 |
| }, |
| { |
| "epoch": 0.47, |
| "learning_rate": 4.2250095181691275e-05, |
| "loss": 0.6352, |
| "step": 11000 |
| }, |
| { |
| "epoch": 0.49, |
| "learning_rate": 4.189757039919907e-05, |
| "loss": 0.6491, |
| "step": 11500 |
| }, |
| { |
| "epoch": 0.51, |
| "learning_rate": 4.154504561670686e-05, |
| "loss": 0.6416, |
| "step": 12000 |
| }, |
| { |
| "epoch": 0.53, |
| "learning_rate": 4.1192520834214646e-05, |
| "loss": 0.618, |
| "step": 12500 |
| }, |
| { |
| "epoch": 0.55, |
| "learning_rate": 4.083999605172244e-05, |
| "loss": 0.5734, |
| "step": 13000 |
| }, |
| { |
| "epoch": 0.57, |
| "learning_rate": 4.048747126923023e-05, |
| "loss": 0.5764, |
| "step": 13500 |
| }, |
| { |
| "epoch": 0.59, |
| "learning_rate": 4.0135651536303005e-05, |
| "loss": 0.6122, |
| "step": 14000 |
| }, |
| { |
| "epoch": 0.61, |
| "learning_rate": 3.97831267538108e-05, |
| "loss": 0.4962, |
| "step": 14500 |
| }, |
| { |
| "epoch": 0.63, |
| "learning_rate": 3.9430601971318584e-05, |
| "loss": 0.5173, |
| "step": 15000 |
| }, |
| { |
| "epoch": 0.66, |
| "learning_rate": 3.907878223839136e-05, |
| "loss": 0.5122, |
| "step": 15500 |
| }, |
| { |
| "epoch": 0.68, |
| "learning_rate": 3.872625745589915e-05, |
| "loss": 0.4673, |
| "step": 16000 |
| }, |
| { |
| "epoch": 0.7, |
| "learning_rate": 3.8373732673406944e-05, |
| "loss": 0.4879, |
| "step": 16500 |
| }, |
| { |
| "epoch": 0.72, |
| "learning_rate": 3.8021207890914736e-05, |
| "loss": 0.4461, |
| "step": 17000 |
| }, |
| { |
| "epoch": 0.74, |
| "learning_rate": 3.766868310842253e-05, |
| "loss": 0.4864, |
| "step": 17500 |
| }, |
| { |
| "epoch": 0.76, |
| "learning_rate": 3.7316158325930315e-05, |
| "loss": 0.4934, |
| "step": 18000 |
| }, |
| { |
| "epoch": 0.78, |
| "learning_rate": 3.69636335434381e-05, |
| "loss": 0.465, |
| "step": 18500 |
| }, |
| { |
| "epoch": 0.8, |
| "learning_rate": 3.661110876094589e-05, |
| "loss": 0.4444, |
| "step": 19000 |
| }, |
| { |
| "epoch": 0.82, |
| "learning_rate": 3.6258583978453686e-05, |
| "loss": 0.4787, |
| "step": 19500 |
| }, |
| { |
| "epoch": 0.85, |
| "learning_rate": 3.590605919596148e-05, |
| "loss": 0.4358, |
| "step": 20000 |
| }, |
| { |
| "epoch": 0.87, |
| "learning_rate": 3.555353441346927e-05, |
| "loss": 0.3978, |
| "step": 20500 |
| }, |
| { |
| "epoch": 0.89, |
| "learning_rate": 3.5201009630977064e-05, |
| "loss": 0.4472, |
| "step": 21000 |
| }, |
| { |
| "epoch": 0.91, |
| "learning_rate": 3.484918989804983e-05, |
| "loss": 0.4205, |
| "step": 21500 |
| }, |
| { |
| "epoch": 0.93, |
| "learning_rate": 3.4496665115557624e-05, |
| "loss": 0.394, |
| "step": 22000 |
| }, |
| { |
| "epoch": 0.95, |
| "learning_rate": 3.4144845382630405e-05, |
| "loss": 0.3727, |
| "step": 22500 |
| }, |
| { |
| "epoch": 0.97, |
| "learning_rate": 3.379232060013819e-05, |
| "loss": 0.4089, |
| "step": 23000 |
| }, |
| { |
| "epoch": 0.99, |
| "learning_rate": 3.343979581764598e-05, |
| "loss": 0.3507, |
| "step": 23500 |
| }, |
| { |
| "epoch": 1.0, |
| "eval_bleu": 68.2888, |
| "eval_gen_len": 25.384, |
| "eval_loss": 0.06593693792819977, |
| "eval_runtime": 35.6722, |
| "eval_samples_per_second": 14.017, |
| "eval_steps_per_second": 3.504, |
| "step": 23639 |
| }, |
| { |
| "epoch": 1.02, |
| "learning_rate": 3.308727103515377e-05, |
| "loss": 0.2763, |
| "step": 24000 |
| }, |
| { |
| "epoch": 1.04, |
| "learning_rate": 3.273474625266156e-05, |
| "loss": 0.2929, |
| "step": 24500 |
| }, |
| { |
| "epoch": 1.06, |
| "learning_rate": 3.2383631569299325e-05, |
| "loss": 0.3075, |
| "step": 25000 |
| }, |
| { |
| "epoch": 1.08, |
| "learning_rate": 3.203110678680711e-05, |
| "loss": 0.262, |
| "step": 25500 |
| }, |
| { |
| "epoch": 1.1, |
| "learning_rate": 3.1678582004314904e-05, |
| "loss": 0.2456, |
| "step": 26000 |
| }, |
| { |
| "epoch": 1.12, |
| "learning_rate": 3.1326057221822696e-05, |
| "loss": 0.2446, |
| "step": 26500 |
| }, |
| { |
| "epoch": 1.14, |
| "learning_rate": 3.097353243933049e-05, |
| "loss": 0.2347, |
| "step": 27000 |
| }, |
| { |
| "epoch": 1.16, |
| "learning_rate": 3.062100765683828e-05, |
| "loss": 0.2831, |
| "step": 27500 |
| }, |
| { |
| "epoch": 1.18, |
| "learning_rate": 3.0269187923911053e-05, |
| "loss": 0.2593, |
| "step": 28000 |
| }, |
| { |
| "epoch": 1.21, |
| "learning_rate": 2.9916663141418842e-05, |
| "loss": 0.2421, |
| "step": 28500 |
| }, |
| { |
| "epoch": 1.23, |
| "learning_rate": 2.9564843408491616e-05, |
| "loss": 0.2507, |
| "step": 29000 |
| }, |
| { |
| "epoch": 1.25, |
| "learning_rate": 2.921231862599941e-05, |
| "loss": 0.2614, |
| "step": 29500 |
| }, |
| { |
| "epoch": 1.27, |
| "learning_rate": 2.8859793843507198e-05, |
| "loss": 0.2224, |
| "step": 30000 |
| }, |
| { |
| "epoch": 1.29, |
| "learning_rate": 2.850726906101499e-05, |
| "loss": 0.277, |
| "step": 30500 |
| }, |
| { |
| "epoch": 1.31, |
| "learning_rate": 2.8154744278522784e-05, |
| "loss": 0.2291, |
| "step": 31000 |
| }, |
| { |
| "epoch": 1.33, |
| "learning_rate": 2.7802219496030573e-05, |
| "loss": 0.2347, |
| "step": 31500 |
| }, |
| { |
| "epoch": 1.35, |
| "learning_rate": 2.7450399763103347e-05, |
| "loss": 0.2425, |
| "step": 32000 |
| }, |
| { |
| "epoch": 1.37, |
| "learning_rate": 2.7098580030176125e-05, |
| "loss": 0.2481, |
| "step": 32500 |
| }, |
| { |
| "epoch": 1.4, |
| "learning_rate": 2.6746055247683914e-05, |
| "loss": 0.2437, |
| "step": 33000 |
| }, |
| { |
| "epoch": 1.42, |
| "learning_rate": 2.63935304651917e-05, |
| "loss": 0.2438, |
| "step": 33500 |
| }, |
| { |
| "epoch": 1.44, |
| "learning_rate": 2.6041005682699493e-05, |
| "loss": 0.2456, |
| "step": 34000 |
| }, |
| { |
| "epoch": 1.46, |
| "learning_rate": 2.5688480900207285e-05, |
| "loss": 0.2239, |
| "step": 34500 |
| }, |
| { |
| "epoch": 1.48, |
| "learning_rate": 2.5335956117715075e-05, |
| "loss": 0.2165, |
| "step": 35000 |
| }, |
| { |
| "epoch": 1.5, |
| "learning_rate": 2.4983431335222867e-05, |
| "loss": 0.1948, |
| "step": 35500 |
| }, |
| { |
| "epoch": 1.52, |
| "learning_rate": 2.463090655273066e-05, |
| "loss": 0.2161, |
| "step": 36000 |
| }, |
| { |
| "epoch": 1.54, |
| "learning_rate": 2.427838177023845e-05, |
| "loss": 0.2112, |
| "step": 36500 |
| }, |
| { |
| "epoch": 1.57, |
| "learning_rate": 2.392585698774624e-05, |
| "loss": 0.2025, |
| "step": 37000 |
| }, |
| { |
| "epoch": 1.59, |
| "learning_rate": 2.357333220525403e-05, |
| "loss": 0.2208, |
| "step": 37500 |
| }, |
| { |
| "epoch": 1.61, |
| "learning_rate": 2.322080742276182e-05, |
| "loss": 0.2175, |
| "step": 38000 |
| }, |
| { |
| "epoch": 1.63, |
| "learning_rate": 2.2868282640269613e-05, |
| "loss": 0.2222, |
| "step": 38500 |
| }, |
| { |
| "epoch": 1.65, |
| "learning_rate": 2.2515757857777402e-05, |
| "loss": 0.2122, |
| "step": 39000 |
| }, |
| { |
| "epoch": 1.67, |
| "learning_rate": 2.2163233075285195e-05, |
| "loss": 0.2034, |
| "step": 39500 |
| }, |
| { |
| "epoch": 1.69, |
| "learning_rate": 2.1810708292792984e-05, |
| "loss": 0.1905, |
| "step": 40000 |
| }, |
| { |
| "epoch": 1.71, |
| "learning_rate": 2.1458183510300777e-05, |
| "loss": 0.1869, |
| "step": 40500 |
| }, |
| { |
| "epoch": 1.73, |
| "learning_rate": 2.1105658727808566e-05, |
| "loss": 0.2258, |
| "step": 41000 |
| }, |
| { |
| "epoch": 1.76, |
| "learning_rate": 2.0753133945316355e-05, |
| "loss": 0.1938, |
| "step": 41500 |
| }, |
| { |
| "epoch": 1.78, |
| "learning_rate": 2.0400609162824148e-05, |
| "loss": 0.1924, |
| "step": 42000 |
| }, |
| { |
| "epoch": 1.8, |
| "learning_rate": 2.0048789429896922e-05, |
| "loss": 0.1651, |
| "step": 42500 |
| }, |
| { |
| "epoch": 1.82, |
| "learning_rate": 1.9696969696969697e-05, |
| "loss": 0.2262, |
| "step": 43000 |
| }, |
| { |
| "epoch": 1.84, |
| "learning_rate": 1.934444491447749e-05, |
| "loss": 0.1959, |
| "step": 43500 |
| }, |
| { |
| "epoch": 1.86, |
| "learning_rate": 1.899192013198528e-05, |
| "loss": 0.1973, |
| "step": 44000 |
| }, |
| { |
| "epoch": 1.88, |
| "learning_rate": 1.8639395349493068e-05, |
| "loss": 0.1683, |
| "step": 44500 |
| }, |
| { |
| "epoch": 1.9, |
| "learning_rate": 1.828687056700086e-05, |
| "loss": 0.2026, |
| "step": 45000 |
| }, |
| { |
| "epoch": 1.92, |
| "learning_rate": 1.7934345784508653e-05, |
| "loss": 0.1885, |
| "step": 45500 |
| }, |
| { |
| "epoch": 1.95, |
| "learning_rate": 1.7581821002016442e-05, |
| "loss": 0.1737, |
| "step": 46000 |
| }, |
| { |
| "epoch": 1.97, |
| "learning_rate": 1.722929621952423e-05, |
| "loss": 0.1895, |
| "step": 46500 |
| }, |
| { |
| "epoch": 1.99, |
| "learning_rate": 1.6878181536161995e-05, |
| "loss": 0.1645, |
| "step": 47000 |
| }, |
| { |
| "epoch": 2.0, |
| "eval_bleu": 0.0, |
| "eval_gen_len": 1.0, |
| "eval_loss": 0.00799082312732935, |
| "eval_runtime": 41.137, |
| "eval_samples_per_second": 12.155, |
| "eval_steps_per_second": 3.039, |
| "step": 47278 |
| }, |
| { |
| "epoch": 2.01, |
| "learning_rate": 1.6525656753669784e-05, |
| "loss": 0.1458, |
| "step": 47500 |
| }, |
| { |
| "epoch": 2.03, |
| "learning_rate": 1.6173131971177573e-05, |
| "loss": 0.088, |
| "step": 48000 |
| }, |
| { |
| "epoch": 2.05, |
| "learning_rate": 1.5820607188685366e-05, |
| "loss": 0.0819, |
| "step": 48500 |
| }, |
| { |
| "epoch": 2.07, |
| "learning_rate": 1.546808240619316e-05, |
| "loss": 0.0885, |
| "step": 49000 |
| }, |
| { |
| "epoch": 2.09, |
| "learning_rate": 1.5115557623700946e-05, |
| "loss": 0.099, |
| "step": 49500 |
| }, |
| { |
| "epoch": 2.12, |
| "learning_rate": 1.4763032841208737e-05, |
| "loss": 0.0994, |
| "step": 50000 |
| }, |
| { |
| "epoch": 2.14, |
| "learning_rate": 1.441050805871653e-05, |
| "loss": 0.0941, |
| "step": 50500 |
| }, |
| { |
| "epoch": 2.16, |
| "learning_rate": 1.4057983276224319e-05, |
| "loss": 0.0977, |
| "step": 51000 |
| }, |
| { |
| "epoch": 2.18, |
| "learning_rate": 1.370545849373211e-05, |
| "loss": 0.0892, |
| "step": 51500 |
| }, |
| { |
| "epoch": 2.2, |
| "learning_rate": 1.33529337112399e-05, |
| "loss": 0.0935, |
| "step": 52000 |
| }, |
| { |
| "epoch": 2.22, |
| "learning_rate": 1.3000408928747693e-05, |
| "loss": 0.1018, |
| "step": 52500 |
| }, |
| { |
| "epoch": 2.24, |
| "learning_rate": 1.264788414625548e-05, |
| "loss": 0.078, |
| "step": 53000 |
| }, |
| { |
| "epoch": 2.26, |
| "learning_rate": 1.2295359363763273e-05, |
| "loss": 0.0866, |
| "step": 53500 |
| }, |
| { |
| "epoch": 2.28, |
| "learning_rate": 1.1942834581271064e-05, |
| "loss": 0.0956, |
| "step": 54000 |
| }, |
| { |
| "epoch": 2.31, |
| "learning_rate": 1.1590309798778855e-05, |
| "loss": 0.0853, |
| "step": 54500 |
| }, |
| { |
| "epoch": 2.33, |
| "learning_rate": 1.1237785016286644e-05, |
| "loss": 0.0882, |
| "step": 55000 |
| }, |
| { |
| "epoch": 2.35, |
| "learning_rate": 1.0885260233794437e-05, |
| "loss": 0.0994, |
| "step": 55500 |
| }, |
| { |
| "epoch": 2.37, |
| "learning_rate": 1.0534145550432197e-05, |
| "loss": 0.087, |
| "step": 56000 |
| }, |
| { |
| "epoch": 2.39, |
| "learning_rate": 1.0181620767939986e-05, |
| "loss": 0.0754, |
| "step": 56500 |
| }, |
| { |
| "epoch": 2.41, |
| "learning_rate": 9.829095985447777e-06, |
| "loss": 0.1035, |
| "step": 57000 |
| }, |
| { |
| "epoch": 2.43, |
| "learning_rate": 9.476571202955568e-06, |
| "loss": 0.0855, |
| "step": 57500 |
| }, |
| { |
| "epoch": 2.45, |
| "learning_rate": 9.124046420463359e-06, |
| "loss": 0.0868, |
| "step": 58000 |
| }, |
| { |
| "epoch": 2.47, |
| "learning_rate": 8.772226687536133e-06, |
| "loss": 0.0873, |
| "step": 58500 |
| }, |
| { |
| "epoch": 2.5, |
| "learning_rate": 8.419701905043926e-06, |
| "loss": 0.0849, |
| "step": 59000 |
| }, |
| { |
| "epoch": 2.52, |
| "learning_rate": 8.0678821721167e-06, |
| "loss": 0.0863, |
| "step": 59500 |
| }, |
| { |
| "epoch": 2.54, |
| "learning_rate": 7.71535738962449e-06, |
| "loss": 0.0833, |
| "step": 60000 |
| }, |
| { |
| "epoch": 2.56, |
| "learning_rate": 7.362832607132282e-06, |
| "loss": 0.0988, |
| "step": 60500 |
| }, |
| { |
| "epoch": 2.58, |
| "learning_rate": 7.010307824640072e-06, |
| "loss": 0.0712, |
| "step": 61000 |
| }, |
| { |
| "epoch": 2.6, |
| "learning_rate": 6.657783042147864e-06, |
| "loss": 0.0784, |
| "step": 61500 |
| }, |
| { |
| "epoch": 2.62, |
| "learning_rate": 6.305258259655654e-06, |
| "loss": 0.0871, |
| "step": 62000 |
| }, |
| { |
| "epoch": 2.64, |
| "learning_rate": 5.952733477163445e-06, |
| "loss": 0.0697, |
| "step": 62500 |
| }, |
| { |
| "epoch": 2.67, |
| "learning_rate": 5.600208694671235e-06, |
| "loss": 0.0681, |
| "step": 63000 |
| }, |
| { |
| "epoch": 2.69, |
| "learning_rate": 5.247683912179026e-06, |
| "loss": 0.0763, |
| "step": 63500 |
| }, |
| { |
| "epoch": 2.71, |
| "learning_rate": 4.895159129686817e-06, |
| "loss": 0.0844, |
| "step": 64000 |
| }, |
| { |
| "epoch": 2.73, |
| "learning_rate": 4.542634347194608e-06, |
| "loss": 0.0803, |
| "step": 64500 |
| }, |
| { |
| "epoch": 2.75, |
| "learning_rate": 4.190109564702399e-06, |
| "loss": 0.0743, |
| "step": 65000 |
| }, |
| { |
| "epoch": 2.77, |
| "learning_rate": 3.837584782210189e-06, |
| "loss": 0.079, |
| "step": 65500 |
| }, |
| { |
| "epoch": 2.79, |
| "learning_rate": 3.48505999971798e-06, |
| "loss": 0.0857, |
| "step": 66000 |
| }, |
| { |
| "epoch": 2.81, |
| "learning_rate": 3.1332402667907553e-06, |
| "loss": 0.0706, |
| "step": 66500 |
| }, |
| { |
| "epoch": 2.83, |
| "learning_rate": 2.7807154842985467e-06, |
| "loss": 0.0666, |
| "step": 67000 |
| }, |
| { |
| "epoch": 2.86, |
| "learning_rate": 2.428190701806337e-06, |
| "loss": 0.0659, |
| "step": 67500 |
| }, |
| { |
| "epoch": 2.88, |
| "learning_rate": 2.0756659193141277e-06, |
| "loss": 0.0793, |
| "step": 68000 |
| }, |
| { |
| "epoch": 2.9, |
| "learning_rate": 1.723846186386903e-06, |
| "loss": 0.0578, |
| "step": 68500 |
| }, |
| { |
| "epoch": 2.92, |
| "learning_rate": 1.3713214038946937e-06, |
| "loss": 0.0788, |
| "step": 69000 |
| }, |
| { |
| "epoch": 2.94, |
| "learning_rate": 1.0187966214024847e-06, |
| "loss": 0.0711, |
| "step": 69500 |
| }, |
| { |
| "epoch": 2.96, |
| "learning_rate": 6.662718389102755e-07, |
| "loss": 0.0622, |
| "step": 70000 |
| }, |
| { |
| "epoch": 2.98, |
| "learning_rate": 3.1445210598305064e-07, |
| "loss": 0.0716, |
| "step": 70500 |
| } |
| ], |
| "max_steps": 70917, |
| "num_train_epochs": 3, |
| "total_flos": 1.3984054543269888e+16, |
| "trial_name": null, |
| "trial_params": null |
| } |