{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 5.0,
  "eval_steps": 500,
  "global_step": 305,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.01639344262295082,
      "grad_norm": 5.58112545754218,
      "learning_rate": 2.580645161290323e-06,
      "loss": 0.9597,
      "step": 1
    },
    {
      "epoch": 0.03278688524590164,
      "grad_norm": 5.5621177659627685,
      "learning_rate": 5.161290322580646e-06,
      "loss": 0.9618,
      "step": 2
    },
    {
      "epoch": 0.04918032786885246,
      "grad_norm": 5.2002830302431375,
      "learning_rate": 7.741935483870968e-06,
      "loss": 0.9528,
      "step": 3
    },
    {
      "epoch": 0.06557377049180328,
      "grad_norm": 2.429288619179718,
      "learning_rate": 1.0322580645161291e-05,
      "loss": 0.8743,
      "step": 4
    },
    {
      "epoch": 0.08196721311475409,
      "grad_norm": 2.303682448858842,
      "learning_rate": 1.2903225806451613e-05,
      "loss": 0.8371,
      "step": 5
    },
    {
      "epoch": 0.09836065573770492,
      "grad_norm": 3.7903444701868207,
      "learning_rate": 1.5483870967741936e-05,
      "loss": 0.8396,
      "step": 6
    },
    {
      "epoch": 0.11475409836065574,
      "grad_norm": 4.276574856653076,
      "learning_rate": 1.806451612903226e-05,
      "loss": 0.8132,
      "step": 7
    },
    {
      "epoch": 0.13114754098360656,
      "grad_norm": 3.2100580662432443,
      "learning_rate": 2.0645161290322582e-05,
      "loss": 0.8073,
      "step": 8
    },
    {
      "epoch": 0.14754098360655737,
      "grad_norm": 2.400368523717908,
      "learning_rate": 2.3225806451612906e-05,
      "loss": 0.7532,
      "step": 9
    },
    {
      "epoch": 0.16393442622950818,
      "grad_norm": 2.107929978755255,
      "learning_rate": 2.5806451612903226e-05,
      "loss": 0.7392,
      "step": 10
    },
    {
      "epoch": 0.18032786885245902,
      "grad_norm": 1.706605478358681,
      "learning_rate": 2.8387096774193552e-05,
      "loss": 0.7263,
      "step": 11
    },
    {
      "epoch": 0.19672131147540983,
      "grad_norm": 1.4882561058595487,
      "learning_rate": 3.096774193548387e-05,
      "loss": 0.709,
      "step": 12
    },
    {
      "epoch": 0.21311475409836064,
      "grad_norm": 1.2623324140261931,
      "learning_rate": 3.3548387096774195e-05,
      "loss": 0.6926,
      "step": 13
    },
    {
      "epoch": 0.22950819672131148,
      "grad_norm": 1.0042413251607623,
      "learning_rate": 3.612903225806452e-05,
      "loss": 0.6882,
      "step": 14
    },
    {
      "epoch": 0.2459016393442623,
      "grad_norm": 1.2054555229493642,
      "learning_rate": 3.870967741935484e-05,
      "loss": 0.6705,
      "step": 15
    },
    {
      "epoch": 0.26229508196721313,
      "grad_norm": 1.404103110949475,
      "learning_rate": 4.1290322580645165e-05,
      "loss": 0.6702,
      "step": 16
    },
    {
      "epoch": 0.2786885245901639,
      "grad_norm": 1.4484944472411994,
      "learning_rate": 4.387096774193548e-05,
      "loss": 0.6655,
      "step": 17
    },
    {
      "epoch": 0.29508196721311475,
      "grad_norm": 0.9845835703253549,
      "learning_rate": 4.645161290322581e-05,
      "loss": 0.6602,
      "step": 18
    },
    {
      "epoch": 0.3114754098360656,
      "grad_norm": 1.8228708849031587,
      "learning_rate": 4.9032258064516135e-05,
      "loss": 0.6586,
      "step": 19
    },
    {
      "epoch": 0.32786885245901637,
      "grad_norm": 1.0073875744998761,
      "learning_rate": 5.161290322580645e-05,
      "loss": 0.6283,
      "step": 20
    },
    {
      "epoch": 0.3442622950819672,
      "grad_norm": 2.1477016621670657,
      "learning_rate": 5.4193548387096774e-05,
      "loss": 0.645,
      "step": 21
    },
    {
      "epoch": 0.36065573770491804,
      "grad_norm": 1.4144920525799276,
      "learning_rate": 5.6774193548387104e-05,
      "loss": 0.6402,
      "step": 22
    },
    {
      "epoch": 0.3770491803278688,
      "grad_norm": 1.7423289786096712,
      "learning_rate": 5.935483870967743e-05,
      "loss": 0.6422,
      "step": 23
    },
    {
      "epoch": 0.39344262295081966,
      "grad_norm": 1.5173523064275696,
      "learning_rate": 6.193548387096774e-05,
      "loss": 0.6368,
      "step": 24
    },
    {
      "epoch": 0.4098360655737705,
      "grad_norm": 1.400015090802773,
      "learning_rate": 6.451612903225807e-05,
      "loss": 0.6331,
      "step": 25
    },
    {
      "epoch": 0.4262295081967213,
      "grad_norm": 1.4981731922098314,
      "learning_rate": 6.709677419354839e-05,
      "loss": 0.6154,
      "step": 26
    },
    {
      "epoch": 0.4426229508196721,
      "grad_norm": 1.3910445077945457,
      "learning_rate": 6.967741935483871e-05,
      "loss": 0.6248,
      "step": 27
    },
    {
      "epoch": 0.45901639344262296,
      "grad_norm": 2.287234175350141,
      "learning_rate": 7.225806451612904e-05,
      "loss": 0.6323,
      "step": 28
    },
    {
      "epoch": 0.47540983606557374,
      "grad_norm": 1.8671369685906136,
      "learning_rate": 7.483870967741936e-05,
      "loss": 0.6233,
      "step": 29
    },
    {
      "epoch": 0.4918032786885246,
      "grad_norm": 1.465283599887322,
      "learning_rate": 7.741935483870968e-05,
      "loss": 0.6156,
      "step": 30
    },
    {
      "epoch": 0.5081967213114754,
      "grad_norm": 2.013369833154031,
      "learning_rate": 8e-05,
      "loss": 0.6138,
      "step": 31
    },
    {
      "epoch": 0.5245901639344263,
      "grad_norm": 1.4445848293844874,
      "learning_rate": 7.99973707985831e-05,
      "loss": 0.6199,
      "step": 32
    },
    {
      "epoch": 0.5409836065573771,
      "grad_norm": 2.008002806928902,
      "learning_rate": 7.998948353996737e-05,
      "loss": 0.6296,
      "step": 33
    },
    {
      "epoch": 0.5573770491803278,
      "grad_norm": 1.6006692484547813,
      "learning_rate": 7.99763392610124e-05,
      "loss": 0.6103,
      "step": 34
    },
    {
      "epoch": 0.5737704918032787,
      "grad_norm": 1.7952820693638532,
      "learning_rate": 7.995793968966601e-05,
      "loss": 0.6189,
      "step": 35
    },
    {
      "epoch": 0.5901639344262295,
      "grad_norm": 1.260255034241788,
      "learning_rate": 7.993428724473719e-05,
      "loss": 0.604,
      "step": 36
    },
    {
      "epoch": 0.6065573770491803,
      "grad_norm": 2.918515643648626,
      "learning_rate": 7.990538503557801e-05,
      "loss": 0.6133,
      "step": 37
    },
    {
      "epoch": 0.6229508196721312,
      "grad_norm": 1.5936203819891444,
      "learning_rate": 7.987123686167492e-05,
      "loss": 0.6041,
      "step": 38
    },
    {
      "epoch": 0.639344262295082,
      "grad_norm": 3.246485277477806,
      "learning_rate": 7.98318472121493e-05,
      "loss": 0.6172,
      "step": 39
    },
    {
      "epoch": 0.6557377049180327,
      "grad_norm": 2.818701756393196,
      "learning_rate": 7.978722126516727e-05,
      "loss": 0.6167,
      "step": 40
    },
    {
      "epoch": 0.6721311475409836,
      "grad_norm": 1.7221453757855867,
      "learning_rate": 7.973736488725896e-05,
      "loss": 0.5973,
      "step": 41
    },
    {
      "epoch": 0.6885245901639344,
      "grad_norm": 1.802719546933628,
      "learning_rate": 7.968228463254735e-05,
      "loss": 0.6016,
      "step": 42
    },
    {
      "epoch": 0.7049180327868853,
      "grad_norm": 1.471860307081128,
      "learning_rate": 7.962198774188664e-05,
      "loss": 0.5971,
      "step": 43
    },
    {
      "epoch": 0.7213114754098361,
      "grad_norm": 1.2373741203327389,
      "learning_rate": 7.955648214191034e-05,
      "loss": 0.5889,
      "step": 44
    },
    {
      "epoch": 0.7377049180327869,
      "grad_norm": 1.261142820035152,
      "learning_rate": 7.948577644398924e-05,
      "loss": 0.5849,
      "step": 45
    },
    {
      "epoch": 0.7540983606557377,
      "grad_norm": 2.0142144188297166,
      "learning_rate": 7.940987994309944e-05,
      "loss": 0.5871,
      "step": 46
    },
    {
      "epoch": 0.7704918032786885,
      "grad_norm": 1.1871765895293158,
      "learning_rate": 7.932880261660031e-05,
      "loss": 0.577,
      "step": 47
    },
    {
      "epoch": 0.7868852459016393,
      "grad_norm": 1.2630851815058795,
      "learning_rate": 7.924255512292291e-05,
      "loss": 0.5856,
      "step": 48
    },
    {
      "epoch": 0.8032786885245902,
      "grad_norm": 1.6849343569275448,
      "learning_rate": 7.91511488001689e-05,
      "loss": 0.5931,
      "step": 49
    },
    {
      "epoch": 0.819672131147541,
      "grad_norm": 0.9859431187761973,
      "learning_rate": 7.905459566461993e-05,
      "loss": 0.571,
      "step": 50
    },
    {
      "epoch": 0.8360655737704918,
      "grad_norm": 1.7623450712427522,
      "learning_rate": 7.895290840915803e-05,
      "loss": 0.5915,
      "step": 51
    },
    {
      "epoch": 0.8524590163934426,
      "grad_norm": 1.2069052100241535,
      "learning_rate": 7.884610040159704e-05,
      "loss": 0.5879,
      "step": 52
    },
    {
      "epoch": 0.8688524590163934,
      "grad_norm": 1.2236642927441974,
      "learning_rate": 7.873418568292517e-05,
      "loss": 0.5779,
      "step": 53
    },
    {
      "epoch": 0.8852459016393442,
      "grad_norm": 5.906328363997742,
      "learning_rate": 7.861717896545928e-05,
      "loss": 0.6193,
      "step": 54
    },
    {
      "epoch": 0.9016393442622951,
      "grad_norm": 2.8517859472775013,
      "learning_rate": 7.849509563091073e-05,
      "loss": 0.6092,
      "step": 55
    },
    {
      "epoch": 0.9180327868852459,
      "grad_norm": 1.6402739036234257,
      "learning_rate": 7.836795172836335e-05,
      "loss": 0.5895,
      "step": 56
    },
    {
      "epoch": 0.9344262295081968,
      "grad_norm": 1.071478048704771,
      "learning_rate": 7.823576397216354e-05,
      "loss": 0.5821,
      "step": 57
    },
    {
      "epoch": 0.9508196721311475,
      "grad_norm": 2.6529573722112563,
      "learning_rate": 7.809854973972311e-05,
      "loss": 0.5988,
      "step": 58
    },
    {
      "epoch": 0.9672131147540983,
      "grad_norm": 2.1179614996092426,
      "learning_rate": 7.795632706923478e-05,
      "loss": 0.6166,
      "step": 59
    },
    {
      "epoch": 0.9836065573770492,
      "grad_norm": 1.9008009613755237,
      "learning_rate": 7.78091146573009e-05,
      "loss": 0.5992,
      "step": 60
    },
    {
      "epoch": 1.0,
      "grad_norm": 1.894221127540697,
      "learning_rate": 7.765693185647557e-05,
      "loss": 0.5826,
      "step": 61
    },
    {
      "epoch": 1.0163934426229508,
      "grad_norm": 1.708613207423494,
      "learning_rate": 7.749979867272055e-05,
      "loss": 0.5692,
      "step": 62
    },
    {
      "epoch": 1.0327868852459017,
      "grad_norm": 1.8649041648138827,
      "learning_rate": 7.733773576277533e-05,
      "loss": 0.5729,
      "step": 63
    },
    {
      "epoch": 1.0491803278688525,
      "grad_norm": 1.41612070780247,
      "learning_rate": 7.71707644314415e-05,
      "loss": 0.5579,
      "step": 64
    },
    {
      "epoch": 1.0655737704918034,
      "grad_norm": 1.9035698103601548,
      "learning_rate": 7.699890662878214e-05,
      "loss": 0.5545,
      "step": 65
    },
    {
      "epoch": 1.0819672131147542,
      "grad_norm": 4.94355544144476,
      "learning_rate": 7.682218494723615e-05,
      "loss": 0.5625,
      "step": 66
    },
    {
      "epoch": 1.098360655737705,
      "grad_norm": 1.4759904398074293,
      "learning_rate": 7.66406226186483e-05,
      "loss": 0.5791,
      "step": 67
    },
    {
      "epoch": 1.1147540983606556,
      "grad_norm": 1.5331088373110235,
      "learning_rate": 7.645424351121519e-05,
      "loss": 0.5604,
      "step": 68
    },
    {
      "epoch": 1.1311475409836065,
      "grad_norm": 1.289579106246813,
      "learning_rate": 7.626307212634745e-05,
      "loss": 0.5585,
      "step": 69
    },
    {
      "epoch": 1.1475409836065573,
      "grad_norm": 1.3594377937672748,
      "learning_rate": 7.606713359544891e-05,
      "loss": 0.5563,
      "step": 70
    },
    {
      "epoch": 1.1639344262295082,
      "grad_norm": 1.4718812627084297,
      "learning_rate": 7.586645367661272e-05,
      "loss": 0.5581,
      "step": 71
    },
    {
      "epoch": 1.180327868852459,
      "grad_norm": 0.9356040160597947,
      "learning_rate": 7.566105875123523e-05,
      "loss": 0.5361,
      "step": 72
    },
    {
      "epoch": 1.1967213114754098,
      "grad_norm": 0.801036937648957,
      "learning_rate": 7.545097582054786e-05,
      "loss": 0.555,
      "step": 73
    },
    {
      "epoch": 1.2131147540983607,
      "grad_norm": 0.9230784392329119,
      "learning_rate": 7.523623250206758e-05,
      "loss": 0.5514,
      "step": 74
    },
    {
      "epoch": 1.2295081967213115,
      "grad_norm": 1.27446883289555,
      "learning_rate": 7.501685702596625e-05,
      "loss": 0.5514,
      "step": 75
    },
    {
      "epoch": 1.2459016393442623,
      "grad_norm": 1.089059775020075,
      "learning_rate": 7.47928782313595e-05,
      "loss": 0.5494,
      "step": 76
    },
    {
      "epoch": 1.2622950819672132,
      "grad_norm": 0.9532311404835904,
      "learning_rate": 7.456432556251552e-05,
      "loss": 0.5369,
      "step": 77
    },
    {
      "epoch": 1.278688524590164,
      "grad_norm": 1.1037696208490315,
      "learning_rate": 7.433122906498436e-05,
      "loss": 0.5454,
      "step": 78
    },
    {
      "epoch": 1.2950819672131146,
      "grad_norm": 1.0044629833456924,
      "learning_rate": 7.409361938164811e-05,
      "loss": 0.5536,
      "step": 79
    },
    {
      "epoch": 1.3114754098360657,
      "grad_norm": 1.139817816380103,
      "learning_rate": 7.385152774869257e-05,
      "loss": 0.5451,
      "step": 80
    },
    {
      "epoch": 1.3278688524590163,
      "grad_norm": 0.9956186270275268,
      "learning_rate": 7.360498599150095e-05,
      "loss": 0.5444,
      "step": 81
    },
    {
      "epoch": 1.3442622950819672,
      "grad_norm": 1.4777189178807848,
      "learning_rate": 7.335402652047011e-05,
      "loss": 0.5482,
      "step": 82
    },
    {
      "epoch": 1.360655737704918,
      "grad_norm": 1.0684258214405637,
      "learning_rate": 7.309868232674992e-05,
      "loss": 0.5425,
      "step": 83
    },
    {
      "epoch": 1.3770491803278688,
      "grad_norm": 1.045469949816082,
      "learning_rate": 7.283898697790615e-05,
      "loss": 0.5326,
      "step": 84
    },
    {
      "epoch": 1.3934426229508197,
      "grad_norm": 1.2884574146396586,
      "learning_rate": 7.257497461350778e-05,
      "loss": 0.541,
      "step": 85
    },
    {
      "epoch": 1.4098360655737705,
      "grad_norm": 1.2291748562831601,
      "learning_rate": 7.23066799406389e-05,
      "loss": 0.5463,
      "step": 86
    },
    {
      "epoch": 1.4262295081967213,
      "grad_norm": 0.8801073224983168,
      "learning_rate": 7.203413822933626e-05,
      "loss": 0.5362,
      "step": 87
    },
    {
      "epoch": 1.4426229508196722,
      "grad_norm": 0.9677124160991732,
      "learning_rate": 7.17573853079525e-05,
      "loss": 0.5405,
      "step": 88
    },
    {
      "epoch": 1.459016393442623,
      "grad_norm": 1.3289631307779222,
      "learning_rate": 7.14764575584463e-05,
      "loss": 0.5393,
      "step": 89
    },
    {
      "epoch": 1.4754098360655736,
      "grad_norm": 0.6274914847275181,
      "learning_rate": 7.119139191159949e-05,
      "loss": 0.5329,
      "step": 90
    },
    {
      "epoch": 1.4918032786885247,
      "grad_norm": 0.5931193088659492,
      "learning_rate": 7.090222584216221e-05,
      "loss": 0.5369,
      "step": 91
    },
    {
      "epoch": 1.5081967213114753,
      "grad_norm": 1.0213786001116576,
      "learning_rate": 7.060899736392643e-05,
      "loss": 0.5404,
      "step": 92
    },
    {
      "epoch": 1.5245901639344264,
      "grad_norm": 1.2526087387777403,
      "learning_rate": 7.031174502472868e-05,
      "loss": 0.5445,
      "step": 93
    },
    {
      "epoch": 1.540983606557377,
      "grad_norm": 0.9943578918521142,
      "learning_rate": 7.001050790138252e-05,
      "loss": 0.5315,
      "step": 94
    },
    {
      "epoch": 1.5573770491803278,
      "grad_norm": 1.2583383738970864,
      "learning_rate": 6.970532559454152e-05,
      "loss": 0.5369,
      "step": 95
    },
    {
      "epoch": 1.5737704918032787,
      "grad_norm": 0.8624167954684502,
      "learning_rate": 6.939623822349339e-05,
      "loss": 0.5212,
      "step": 96
    },
    {
      "epoch": 1.5901639344262295,
      "grad_norm": 0.8714228266723913,
      "learning_rate": 6.908328642088579e-05,
      "loss": 0.5325,
      "step": 97
    },
    {
      "epoch": 1.6065573770491803,
      "grad_norm": 0.9559074422174284,
      "learning_rate": 6.87665113273849e-05,
      "loss": 0.5354,
      "step": 98
    },
    {
      "epoch": 1.6229508196721312,
      "grad_norm": 1.020705004388792,
      "learning_rate": 6.844595458626691e-05,
      "loss": 0.5305,
      "step": 99
    },
    {
      "epoch": 1.639344262295082,
      "grad_norm": 1.2718848911597436,
      "learning_rate": 6.812165833794375e-05,
      "loss": 0.534,
      "step": 100
    },
    {
      "epoch": 1.6557377049180326,
      "grad_norm": 0.8646369335110455,
      "learning_rate": 6.779366521442318e-05,
      "loss": 0.5298,
      "step": 101
    },
    {
      "epoch": 1.6721311475409837,
      "grad_norm": 1.0431466383974872,
      "learning_rate": 6.746201833370447e-05,
      "loss": 0.5258,
      "step": 102
    },
    {
      "epoch": 1.6885245901639343,
      "grad_norm": 1.2838018594279825,
      "learning_rate": 6.712676129411005e-05,
      "loss": 0.5363,
      "step": 103
    },
    {
      "epoch": 1.7049180327868854,
      "grad_norm": 0.7735313109171527,
      "learning_rate": 6.678793816855408e-05,
      "loss": 0.5256,
      "step": 104
    },
    {
      "epoch": 1.721311475409836,
      "grad_norm": 1.1292517886407496,
      "learning_rate": 6.644559349874869e-05,
      "loss": 0.5366,
      "step": 105
    },
    {
      "epoch": 1.737704918032787,
      "grad_norm": 1.1330330672560118,
      "learning_rate": 6.609977228934838e-05,
      "loss": 0.522,
      "step": 106
    },
    {
      "epoch": 1.7540983606557377,
      "grad_norm": 0.8960440091301521,
      "learning_rate": 6.575052000203386e-05,
      "loss": 0.5321,
      "step": 107
    },
    {
      "epoch": 1.7704918032786885,
      "grad_norm": 0.8001970911153957,
      "learning_rate": 6.539788254953557e-05,
      "loss": 0.5292,
      "step": 108
    },
    {
      "epoch": 1.7868852459016393,
      "grad_norm": 0.826768977068413,
      "learning_rate": 6.504190628959798e-05,
      "loss": 0.5266,
      "step": 109
    },
    {
      "epoch": 1.8032786885245902,
      "grad_norm": 1.047081785970805,
      "learning_rate": 6.468263801888546e-05,
      "loss": 0.5249,
      "step": 110
    },
    {
      "epoch": 1.819672131147541,
      "grad_norm": 1.2237949792369915,
      "learning_rate": 6.432012496683032e-05,
      "loss": 0.5252,
      "step": 111
    },
    {
      "epoch": 1.8360655737704918,
      "grad_norm": 0.5326723853060618,
      "learning_rate": 6.395441478942404e-05,
      "loss": 0.5286,
      "step": 112
    },
    {
      "epoch": 1.8524590163934427,
      "grad_norm": 0.8184475093441959,
      "learning_rate": 6.358555556295252e-05,
      "loss": 0.5271,
      "step": 113
    },
    {
      "epoch": 1.8688524590163933,
      "grad_norm": 1.0030170729677295,
      "learning_rate": 6.321359577767574e-05,
      "loss": 0.5355,
      "step": 114
    },
    {
      "epoch": 1.8852459016393444,
      "grad_norm": 0.8438447158790536,
      "learning_rate": 6.283858433145345e-05,
      "loss": 0.5256,
      "step": 115
    },
    {
      "epoch": 1.901639344262295,
      "grad_norm": 0.6775475091945453,
      "learning_rate": 6.246057052331693e-05,
      "loss": 0.524,
      "step": 116
    },
    {
      "epoch": 1.918032786885246,
      "grad_norm": 0.5277047106995992,
      "learning_rate": 6.207960404698818e-05,
      "loss": 0.5243,
      "step": 117
    },
    {
      "epoch": 1.9344262295081966,
      "grad_norm": 0.5241956608413472,
      "learning_rate": 6.169573498434717e-05,
      "loss": 0.5257,
      "step": 118
    },
    {
      "epoch": 1.9508196721311475,
      "grad_norm": 0.6035995248402194,
      "learning_rate": 6.130901379884808e-05,
      "loss": 0.5202,
      "step": 119
    },
    {
      "epoch": 1.9672131147540983,
      "grad_norm": 0.779564589475285,
      "learning_rate": 6.091949132888534e-05,
      "loss": 0.527,
      "step": 120
    },
    {
      "epoch": 1.9836065573770492,
      "grad_norm": 1.022288473196731,
      "learning_rate": 6.052721878111045e-05,
      "loss": 0.5281,
      "step": 121
    },
    {
      "epoch": 2.0,
      "grad_norm": 1.2764091263174406,
      "learning_rate": 6.0132247723700324e-05,
      "loss": 0.5128,
      "step": 122
    },
    {
      "epoch": 2.0163934426229506,
      "grad_norm": 0.7708344672192273,
      "learning_rate": 5.973463007957817e-05,
      "loss": 0.4906,
      "step": 123
    },
    {
      "epoch": 2.0327868852459017,
      "grad_norm": 0.7977537199838536,
      "learning_rate": 5.933441811958764e-05,
      "loss": 0.4939,
      "step": 124
    },
    {
      "epoch": 2.0491803278688523,
      "grad_norm": 1.0793702753193162,
      "learning_rate": 5.8931664455621345e-05,
      "loss": 0.4952,
      "step": 125
    },
    {
      "epoch": 2.0655737704918034,
      "grad_norm": 0.8986778009783392,
      "learning_rate": 5.8526422033704494e-05,
      "loss": 0.4892,
      "step": 126
    },
    {
      "epoch": 2.081967213114754,
      "grad_norm": 1.030605766210909,
      "learning_rate": 5.811874412703457e-05,
      "loss": 0.4903,
      "step": 127
    },
    {
      "epoch": 2.098360655737705,
      "grad_norm": 1.0660924489319643,
      "learning_rate": 5.770868432897809e-05,
      "loss": 0.4874,
      "step": 128
    },
    {
      "epoch": 2.1147540983606556,
      "grad_norm": 0.940262946228767,
      "learning_rate": 5.729629654602513e-05,
      "loss": 0.4829,
      "step": 129
    },
    {
      "epoch": 2.1311475409836067,
      "grad_norm": 0.9888397367489762,
      "learning_rate": 5.6881634990702855e-05,
      "loss": 0.4918,
      "step": 130
    },
    {
      "epoch": 2.1475409836065573,
      "grad_norm": 0.8484560909641439,
      "learning_rate": 5.646475417444873e-05,
      "loss": 0.4901,
      "step": 131
    },
    {
      "epoch": 2.1639344262295084,
      "grad_norm": 0.521192253144461,
      "learning_rate": 5.6045708900444364e-05,
      "loss": 0.4949,
      "step": 132
    },
    {
      "epoch": 2.180327868852459,
      "grad_norm": 0.771777999740359,
      "learning_rate": 5.5624554256411175e-05,
      "loss": 0.4811,
      "step": 133
    },
    {
      "epoch": 2.19672131147541,
      "grad_norm": 0.8227491310955285,
      "learning_rate": 5.520134560736852e-05,
      "loss": 0.4885,
      "step": 134
    },
    {
      "epoch": 2.2131147540983607,
      "grad_norm": 0.5992373916210076,
      "learning_rate": 5.477613858835536e-05,
      "loss": 0.4872,
      "step": 135
    },
    {
      "epoch": 2.2295081967213113,
      "grad_norm": 0.5052047355472387,
      "learning_rate": 5.434898909711656e-05,
      "loss": 0.4805,
      "step": 136
    },
    {
      "epoch": 2.2459016393442623,
      "grad_norm": 0.7157168717143663,
      "learning_rate": 5.3919953286754477e-05,
      "loss": 0.4825,
      "step": 137
    },
    {
      "epoch": 2.262295081967213,
      "grad_norm": 0.5410573992811272,
      "learning_rate": 5.348908755834716e-05,
      "loss": 0.4834,
      "step": 138
    },
    {
      "epoch": 2.278688524590164,
      "grad_norm": 0.4166121333458553,
      "learning_rate": 5.305644855353378e-05,
      "loss": 0.4833,
      "step": 139
    },
    {
      "epoch": 2.2950819672131146,
      "grad_norm": 0.5233638295240889,
      "learning_rate": 5.262209314706856e-05,
      "loss": 0.4844,
      "step": 140
    },
    {
      "epoch": 2.3114754098360657,
      "grad_norm": 0.4090985726833338,
      "learning_rate": 5.218607843934401e-05,
      "loss": 0.487,
      "step": 141
    },
    {
      "epoch": 2.3278688524590163,
      "grad_norm": 0.363420954496579,
      "learning_rate": 5.1748461748884476e-05,
      "loss": 0.4774,
      "step": 142
    },
    {
      "epoch": 2.3442622950819674,
      "grad_norm": 0.4690278843251207,
      "learning_rate": 5.130930060481113e-05,
      "loss": 0.4811,
      "step": 143
    },
    {
      "epoch": 2.360655737704918,
      "grad_norm": 0.46496866291592687,
      "learning_rate": 5.0868652739279045e-05,
      "loss": 0.4799,
      "step": 144
    },
    {
      "epoch": 2.3770491803278686,
      "grad_norm": 0.4067588783945542,
      "learning_rate": 5.042657607988787e-05,
      "loss": 0.4884,
      "step": 145
    },
    {
      "epoch": 2.3934426229508197,
      "grad_norm": 0.3249832363624369,
      "learning_rate": 4.9983128742066553e-05,
      "loss": 0.4819,
      "step": 146
    },
    {
      "epoch": 2.4098360655737707,
      "grad_norm": 0.3155267675853147,
      "learning_rate": 4.953836902143354e-05,
      "loss": 0.4817,
      "step": 147
    },
    {
      "epoch": 2.4262295081967213,
      "grad_norm": 0.30276002082546505,
      "learning_rate": 4.909235538613321e-05,
      "loss": 0.4817,
      "step": 148
    },
    {
      "epoch": 2.442622950819672,
      "grad_norm": 0.3578858890853383,
      "learning_rate": 4.864514646914968e-05,
      "loss": 0.4855,
      "step": 149
    },
    {
      "epoch": 2.459016393442623,
      "grad_norm": 0.28701615085250826,
      "learning_rate": 4.819680106059884e-05,
      "loss": 0.4829,
      "step": 150
    },
    {
      "epoch": 2.4754098360655736,
      "grad_norm": 0.37224620973373557,
      "learning_rate": 4.7747378099999866e-05,
      "loss": 0.491,
      "step": 151
    },
    {
      "epoch": 2.4918032786885247,
      "grad_norm": 0.34012579144317995,
      "learning_rate": 4.729693666852699e-05,
      "loss": 0.4884,
      "step": 152
    },
    {
      "epoch": 2.5081967213114753,
      "grad_norm": 0.3774404172171865,
      "learning_rate": 4.684553598124272e-05,
      "loss": 0.4851,
      "step": 153
    },
    {
      "epoch": 2.5245901639344264,
      "grad_norm": 0.3549426068916513,
      "learning_rate": 4.639323537931339e-05,
      "loss": 0.4819,
      "step": 154
    },
    {
      "epoch": 2.540983606557377,
      "grad_norm": 0.39031877291102196,
      "learning_rate": 4.5940094322208156e-05,
      "loss": 0.4905,
      "step": 155
    },
    {
      "epoch": 2.557377049180328,
      "grad_norm": 0.295463066655204,
      "learning_rate": 4.5486172379882495e-05,
      "loss": 0.4765,
      "step": 156
    },
    {
      "epoch": 2.5737704918032787,
      "grad_norm": 0.29781720638326936,
      "learning_rate": 4.5031529224947106e-05,
      "loss": 0.4852,
      "step": 157
    },
    {
      "epoch": 2.5901639344262293,
      "grad_norm": 0.2852567559187638,
      "learning_rate": 4.4576224624823334e-05,
      "loss": 0.4836,
      "step": 158
    },
    {
      "epoch": 2.6065573770491803,
      "grad_norm": 0.239161233813967,
      "learning_rate": 4.412031843388618e-05,
      "loss": 0.4889,
      "step": 159
    },
    {
      "epoch": 2.6229508196721314,
      "grad_norm": 0.37163233149883945,
      "learning_rate": 4.36638705855958e-05,
      "loss": 0.481,
      "step": 160
    },
    {
      "epoch": 2.639344262295082,
      "grad_norm": 0.35373235092972494,
      "learning_rate": 4.320694108461868e-05,
      "loss": 0.4738,
      "step": 161
    },
    {
      "epoch": 2.6557377049180326,
      "grad_norm": 0.3602393611325291,
      "learning_rate": 4.2749589998939366e-05,
      "loss": 0.4783,
      "step": 162
    },
    {
      "epoch": 2.6721311475409837,
      "grad_norm": 0.33028586314976544,
      "learning_rate": 4.2291877451964e-05,
      "loss": 0.4807,
      "step": 163
    },
    {
      "epoch": 2.6885245901639343,
      "grad_norm": 0.27454726727065487,
      "learning_rate": 4.183386361461643e-05,
      "loss": 0.4754,
      "step": 164
    },
    {
      "epoch": 2.7049180327868854,
      "grad_norm": 0.2201982880819997,
      "learning_rate": 4.137560869742816e-05,
      "loss": 0.4816,
      "step": 165
    },
    {
      "epoch": 2.721311475409836,
      "grad_norm": 0.24024736204090635,
      "learning_rate": 4.0917172942623074e-05,
      "loss": 0.477,
      "step": 166
    },
    {
      "epoch": 2.737704918032787,
      "grad_norm": 0.2774042453950074,
      "learning_rate": 4.045861661619796e-05,
      "loss": 0.4863,
      "step": 167
    },
    {
      "epoch": 2.7540983606557377,
      "grad_norm": 0.25031478188990913,
      "learning_rate": 4e-05,
      "loss": 0.4806,
      "step": 168
    },
    {
      "epoch": 2.7704918032786887,
      "grad_norm": 0.3146343902104566,
      "learning_rate": 3.954138338380204e-05,
      "loss": 0.4819,
      "step": 169
    },
    {
      "epoch": 2.7868852459016393,
      "grad_norm": 0.29476137692022925,
      "learning_rate": 3.908282705737694e-05,
      "loss": 0.4829,
      "step": 170
    },
    {
      "epoch": 2.80327868852459,
      "grad_norm": 0.2781979864605831,
      "learning_rate": 3.862439130257184e-05,
      "loss": 0.4842,
      "step": 171
    },
    {
      "epoch": 2.819672131147541,
      "grad_norm": 0.2812486506160709,
      "learning_rate": 3.8166136385383585e-05,
      "loss": 0.4848,
      "step": 172
    },
    {
      "epoch": 2.836065573770492,
      "grad_norm": 0.2443137176068297,
      "learning_rate": 3.770812254803601e-05,
      "loss": 0.4814,
      "step": 173
    },
    {
      "epoch": 2.8524590163934427,
      "grad_norm": 0.2139764225453824,
      "learning_rate": 3.725041000106065e-05,
      "loss": 0.4793,
      "step": 174
    },
    {
      "epoch": 2.8688524590163933,
      "grad_norm": 0.2771342287766635,
      "learning_rate": 3.679305891538133e-05,
      "loss": 0.466,
      "step": 175
    },
    {
      "epoch": 2.8852459016393444,
      "grad_norm": 0.26719263531966153,
      "learning_rate": 3.6336129414404214e-05,
      "loss": 0.4818,
      "step": 176
    },
    {
      "epoch": 2.901639344262295,
      "grad_norm": 0.21705964839463288,
      "learning_rate": 3.5879681566113836e-05,
      "loss": 0.4841,
      "step": 177
    },
    {
      "epoch": 2.918032786885246,
      "grad_norm": 0.20964241906287742,
      "learning_rate": 3.5423775375176686e-05,
      "loss": 0.4857,
      "step": 178
    },
    {
      "epoch": 2.9344262295081966,
      "grad_norm": 0.19494840468268942,
      "learning_rate": 3.4968470775052914e-05,
      "loss": 0.475,
      "step": 179
    },
    {
      "epoch": 2.9508196721311473,
      "grad_norm": 0.19411352485607267,
      "learning_rate": 3.451382762011753e-05,
      "loss": 0.4765,
      "step": 180
    },
    {
      "epoch": 2.9672131147540983,
      "grad_norm": 0.2389081408757777,
      "learning_rate": 3.405990567779185e-05,
      "loss": 0.4843,
      "step": 181
    },
    {
      "epoch": 2.9836065573770494,
      "grad_norm": 0.19939844528891387,
      "learning_rate": 3.3606764620686614e-05,
      "loss": 0.4807,
      "step": 182
    },
    {
      "epoch": 3.0,
      "grad_norm": 0.23959467510788,
      "learning_rate": 3.3154464018757284e-05,
      "loss": 0.483,
      "step": 183
    },
    {
      "epoch": 3.0163934426229506,
      "grad_norm": 0.20927947022485016,
      "learning_rate": 3.270306333147301e-05,
      "loss": 0.4458,
      "step": 184
    },
    {
      "epoch": 3.0327868852459017,
      "grad_norm": 0.2064164028547752,
      "learning_rate": 3.225262190000015e-05,
      "loss": 0.46,
      "step": 185
    },
    {
      "epoch": 3.0491803278688523,
      "grad_norm": 0.22391411463107186,
      "learning_rate": 3.180319893940117e-05,
      "loss": 0.4488,
      "step": 186
    },
    {
      "epoch": 3.0655737704918034,
      "grad_norm": 0.21718594350542614,
      "learning_rate": 3.135485353085033e-05,
      "loss": 0.4487,
      "step": 187
    },
    {
      "epoch": 3.081967213114754,
      "grad_norm": 0.2467699050215421,
      "learning_rate": 3.0907644613866795e-05,
      "loss": 0.4466,
      "step": 188
    },
    {
      "epoch": 3.098360655737705,
      "grad_norm": 0.23078659838474802,
      "learning_rate": 3.0461630978566473e-05,
      "loss": 0.4423,
      "step": 189
    },
    {
      "epoch": 3.1147540983606556,
      "grad_norm": 0.2370592376095119,
      "learning_rate": 3.0016871257933457e-05,
      "loss": 0.4438,
      "step": 190
    },
    {
      "epoch": 3.1311475409836067,
      "grad_norm": 0.21570100983782378,
      "learning_rate": 2.9573423920112155e-05,
      "loss": 0.4455,
      "step": 191
    },
    {
      "epoch": 3.1475409836065573,
      "grad_norm": 0.21407395891268235,
      "learning_rate": 2.9131347260720965e-05,
      "loss": 0.4463,
      "step": 192
    },
    {
      "epoch": 3.1639344262295084,
      "grad_norm": 0.20617008048319857,
      "learning_rate": 2.8690699395188873e-05,
      "loss": 0.4465,
      "step": 193
    },
    {
      "epoch": 3.180327868852459,
      "grad_norm": 0.2125529545129492,
      "learning_rate": 2.825153825111553e-05,
      "loss": 0.445,
      "step": 194
    },
    {
      "epoch": 3.19672131147541,
      "grad_norm": 0.2002948390041686,
      "learning_rate": 2.7813921560656003e-05,
      "loss": 0.4472,
      "step": 195
    },
    {
      "epoch": 3.2131147540983607,
      "grad_norm": 0.19399060354592243,
      "learning_rate": 2.737790685293145e-05,
      "loss": 0.4384,
      "step": 196
    },
    {
      "epoch": 3.2295081967213113,
      "grad_norm": 0.20413317551543775,
      "learning_rate": 2.694355144646622e-05,
      "loss": 0.4474,
      "step": 197
    },
    {
      "epoch": 3.2459016393442623,
      "grad_norm": 0.178524571287303,
      "learning_rate": 2.6510912441652855e-05,
      "loss": 0.4476,
      "step": 198
    },
    {
      "epoch": 3.262295081967213,
      "grad_norm": 0.21495762020528217,
      "learning_rate": 2.6080046713245527e-05,
      "loss": 0.4481,
      "step": 199
    },
    {
      "epoch": 3.278688524590164,
      "grad_norm": 0.17779692741231456,
      "learning_rate": 2.5651010902883457e-05,
      "loss": 0.4447,
      "step": 200
    },
    {
      "epoch": 3.2950819672131146,
      "grad_norm": 0.19536488579919992,
      "learning_rate": 2.5223861411644647e-05,
      "loss": 0.4406,
      "step": 201
    },
    {
      "epoch": 3.3114754098360657,
      "grad_norm": 0.21534013434506138,
      "learning_rate": 2.4798654392631502e-05,
      "loss": 0.443,
      "step": 202
    },
    {
      "epoch": 3.3278688524590163,
      "grad_norm": 0.17678888902216489,
      "learning_rate": 2.4375445743588832e-05,
      "loss": 0.4481,
      "step": 203
    },
    {
      "epoch": 3.3442622950819674,
      "grad_norm": 0.21881378852102576,
      "learning_rate": 2.3954291099555652e-05,
      "loss": 0.4484,
      "step": 204
    },
    {
      "epoch": 3.360655737704918,
      "grad_norm": 0.1911371536452111,
      "learning_rate": 2.3535245825551285e-05,
      "loss": 0.4407,
      "step": 205
    },
    {
      "epoch": 3.3770491803278686,
      "grad_norm": 0.1822153835411314,
      "learning_rate": 2.3118365009297145e-05,
      "loss": 0.4474,
      "step": 206
    },
    {
      "epoch": 3.3934426229508197,
      "grad_norm": 0.1967826759425408,
      "learning_rate": 2.2703703453974883e-05,
      "loss": 0.4466,
      "step": 207
    },
    {
      "epoch": 3.4098360655737707,
      "grad_norm": 0.1787783354685364,
      "learning_rate": 2.2291315671021922e-05,
      "loss": 0.4417,
      "step": 208
    },
    {
      "epoch": 3.4262295081967213,
      "grad_norm": 0.18368967889880147,
      "learning_rate": 2.1881255872965434e-05,
      "loss": 0.4438,
      "step": 209
    },
    {
      "epoch": 3.442622950819672,
      "grad_norm": 0.16138571200714233,
      "learning_rate": 2.1473577966295506e-05,
      "loss": 0.4479,
      "step": 210
    },
    {
      "epoch": 3.459016393442623,
      "grad_norm": 0.15851838029612553,
      "learning_rate": 2.1068335544378672e-05,
      "loss": 0.4511,
      "step": 211
    },
    {
      "epoch": 3.4754098360655736,
      "grad_norm": 0.15179774543882815,
      "learning_rate": 2.0665581880412366e-05,
      "loss": 0.4447,
      "step": 212
    },
    {
      "epoch": 3.4918032786885247,
      "grad_norm": 0.15563700688196921,
      "learning_rate": 2.026536992042184e-05,
      "loss": 0.4456,
      "step": 213
    },
    {
      "epoch": 3.5081967213114753,
      "grad_norm": 0.14825148109958275,
      "learning_rate": 1.9867752276299686e-05,
      "loss": 0.4472,
      "step": 214
    },
    {
      "epoch": 3.5245901639344264,
      "grad_norm": 0.12432896908408256,
      "learning_rate": 1.9472781218889565e-05,
      "loss": 0.4491,
      "step": 215
    },
    {
      "epoch": 3.540983606557377,
      "grad_norm": 0.14986131261136762,
      "learning_rate": 1.908050867111468e-05,
      "loss": 0.447,
      "step": 216
    },
    {
      "epoch": 3.557377049180328,
      "grad_norm": 0.1280996425182516,
      "learning_rate": 1.869098620115194e-05,
      "loss": 0.4459,
      "step": 217
    },
    {
      "epoch": 3.5737704918032787,
      "grad_norm": 0.15181510097669454,
      "learning_rate": 1.8304265015652835e-05,
      "loss": 0.444,
      "step": 218
    },
    {
      "epoch": 3.5901639344262293,
      "grad_norm": 0.1402130102169863,
      "learning_rate": 1.792039595301183e-05,
      "loss": 0.4521,
      "step": 219
    },
    {
      "epoch": 3.6065573770491803,
      "grad_norm": 0.16291621368769718,
      "learning_rate": 1.7539429476683088e-05,
      "loss": 0.447,
      "step": 220
    },
    {
      "epoch": 3.6229508196721314,
      "grad_norm": 0.13943515675692342,
      "learning_rate": 1.7161415668546557e-05,
      "loss": 0.4411,
      "step": 221
    },
    {
      "epoch": 3.639344262295082,
      "grad_norm": 0.1419363610523589,
      "learning_rate": 1.6786404222324278e-05,
      "loss": 0.442,
      "step": 222
    },
    {
      "epoch": 3.6557377049180326,
      "grad_norm": 0.13518707011783945,
      "learning_rate": 1.641444443704749e-05,
      "loss": 0.444,
      "step": 223
    },
    {
      "epoch": 3.6721311475409837,
      "grad_norm": 0.12940286831893916,
      "learning_rate": 1.6045585210575955e-05,
      "loss": 0.448,
      "step": 224
    },
    {
      "epoch": 3.6885245901639343,
      "grad_norm": 0.15612865445371793,
      "learning_rate": 1.56798750331697e-05,
      "loss": 0.4439,
      "step": 225
    },
    {
      "epoch": 3.7049180327868854,
      "grad_norm": 0.13228999124818513,
      "learning_rate": 1.531736198111456e-05,
      "loss": 0.4402,
      "step": 226
    },
    {
      "epoch": 3.721311475409836,
      "grad_norm": 0.14034136382445142,
      "learning_rate": 1.4958093710402036e-05,
      "loss": 0.4394,
      "step": 227
    },
    {
      "epoch": 3.737704918032787,
      "grad_norm": 0.14905661892566827,
      "learning_rate": 1.4602117450464453e-05,
      "loss": 0.4481,
      "step": 228
    },
    {
      "epoch": 3.7540983606557377,
      "grad_norm": 0.16177536977164272,
      "learning_rate": 1.4249479997966145e-05,
      "loss": 0.4457,
      "step": 229
    },
    {
      "epoch": 3.7704918032786887,
      "grad_norm": 0.12677113397589643,
      "learning_rate": 1.3900227710651647e-05,
      "loss": 0.4445,
      "step": 230
    },
    {
      "epoch": 3.7868852459016393,
      "grad_norm": 0.17192511217282086,
      "learning_rate": 1.3554406501251332e-05,
      "loss": 0.4459,
      "step": 231
    },
    {
      "epoch": 3.80327868852459,
      "grad_norm": 0.11920529678713093,
      "learning_rate": 1.3212061831445913e-05,
      "loss": 0.4382,
      "step": 232
    },
    {
      "epoch": 3.819672131147541,
      "grad_norm": 0.12712529888330631,
      "learning_rate": 1.2873238705889968e-05,
      "loss": 0.441,
      "step": 233
    },
    {
      "epoch": 3.836065573770492,
      "grad_norm": 0.12199439925977827,
      "learning_rate": 1.2537981666295536e-05,
      "loss": 0.442,
      "step": 234
    },
    {
      "epoch": 3.8524590163934427,
      "grad_norm": 0.13371713843483143,
      "learning_rate": 1.2206334785576822e-05,
      "loss": 0.4381,
      "step": 235
    },
    {
      "epoch": 3.8688524590163933,
      "grad_norm": 0.11420461356422432,
      "learning_rate": 1.1878341662056259e-05,
      "loss": 0.4442,
      "step": 236
    },
    {
      "epoch": 3.8852459016393444,
      "grad_norm": 0.11704524464444101,
      "learning_rate": 1.1554045413733097e-05,
      "loss": 0.4435,
      "step": 237
    },
    {
      "epoch": 3.901639344262295,
      "grad_norm": 0.11799803709109379,
      "learning_rate": 1.123348867261512e-05,
      "loss": 0.4444,
      "step": 238
    },
    {
      "epoch": 3.918032786885246,
      "grad_norm": 0.12095672068157937,
      "learning_rate": 1.0916713579114218e-05,
      "loss": 0.4425,
      "step": 239
    },
    {
      "epoch": 3.9344262295081966,
      "grad_norm": 0.10579439481053957,
      "learning_rate": 1.0603761776506616e-05,
      "loss": 0.4448,
      "step": 240
    },
    {
      "epoch": 3.9508196721311473,
      "grad_norm": 0.12927283153010521,
      "learning_rate": 1.029467440545849e-05,
      "loss": 0.444,
      "step": 241
    },
    {
      "epoch": 3.9672131147540983,
      "grad_norm": 0.11979578396485492,
      "learning_rate": 9.989492098617494e-06,
      "loss": 0.4462,
      "step": 242
    },
    {
      "epoch": 3.9836065573770494,
      "grad_norm": 0.11270512581429702,
      "learning_rate": 9.688254975271327e-06,
      "loss": 0.4439,
      "step": 243
    },
    {
      "epoch": 4.0,
      "grad_norm": 0.12212819186751334,
      "learning_rate": 9.391002636073589e-06,
      "loss": 0.427,
      "step": 244
    },
    {
      "epoch": 4.016393442622951,
      "grad_norm": 0.15550079584648527,
      "learning_rate": 9.0977741578378e-06,
      "loss": 0.4269,
      "step": 245
    },
    {
      "epoch": 4.032786885245901,
      "grad_norm": 0.13688670983262652,
      "learning_rate": 8.808608088400522e-06,
      "loss": 0.4269,
      "step": 246
    },
    {
      "epoch": 4.049180327868853,
      "grad_norm": 0.11888941041824816,
      "learning_rate": 8.523542441553712e-06,
      "loss": 0.4212,
      "step": 247
    },
    {
      "epoch": 4.065573770491803,
      "grad_norm": 0.13424666760515866,
      "learning_rate": 8.242614692047507e-06,
      "loss": 0.4288,
      "step": 248
    },
    {
      "epoch": 4.081967213114754,
      "grad_norm": 0.15658242982421974,
      "learning_rate": 7.96586177066375e-06,
      "loss": 0.4234,
      "step": 249
    },
    {
      "epoch": 4.098360655737705,
      "grad_norm": 0.14858315425443683,
      "learning_rate": 7.693320059361107e-06,
      "loss": 0.4236,
      "step": 250
    },
    {
      "epoch": 4.114754098360656,
      "grad_norm": 0.1432177280213629,
      "learning_rate": 7.425025386492231e-06,
      "loss": 0.4324,
      "step": 251
    },
    {
      "epoch": 4.131147540983607,
      "grad_norm": 0.1363328212537908,
      "learning_rate": 7.161013022093861e-06,
      "loss": 0.4305,
      "step": 252
    },
    {
      "epoch": 4.147540983606557,
      "grad_norm": 0.13831059143305544,
      "learning_rate": 6.901317673250089e-06,
      "loss": 0.4198,
      "step": 253
    },
    {
      "epoch": 4.163934426229508,
      "grad_norm": 0.1479605646882516,
      "learning_rate": 6.645973479529892e-06,
      "loss": 0.4275,
      "step": 254
    },
    {
      "epoch": 4.180327868852459,
      "grad_norm": 0.12133684415312865,
      "learning_rate": 6.395014008499067e-06,
      "loss": 0.4286,
      "step": 255
    },
    {
      "epoch": 4.19672131147541,
      "grad_norm": 0.11552099846590175,
      "learning_rate": 6.148472251307436e-06,
      "loss": 0.4249,
      "step": 256
    },
    {
      "epoch": 4.213114754098361,
      "grad_norm": 0.1286344544278559,
      "learning_rate": 5.906380618351893e-06,
      "loss": 0.4266,
      "step": 257
    },
    {
      "epoch": 4.229508196721311,
      "grad_norm": 0.12137839353942781,
      "learning_rate": 5.668770935015641e-06,
      "loss": 0.4193,
      "step": 258
    },
    {
      "epoch": 4.245901639344262,
      "grad_norm": 0.10879456442085193,
      "learning_rate": 5.435674437484491e-06,
      "loss": 0.4213,
      "step": 259
    },
    {
      "epoch": 4.262295081967213,
      "grad_norm": 0.10654250511384006,
      "learning_rate": 5.207121768640519e-06,
      "loss": 0.415,
      "step": 260
    },
    {
      "epoch": 4.278688524590164,
      "grad_norm": 0.11634254635354951,
      "learning_rate": 4.983142974033764e-06,
      "loss": 0.4278,
      "step": 261
    },
    {
      "epoch": 4.295081967213115,
      "grad_norm": 0.1188048854614297,
      "learning_rate": 4.763767497932419e-06,
      "loss": 0.4186,
      "step": 262
    },
    {
      "epoch": 4.311475409836065,
      "grad_norm": 0.10499559306706431,
      "learning_rate": 4.549024179452151e-06,
      "loss": 0.4192,
      "step": 263
    },
    {
      "epoch": 4.327868852459017,
      "grad_norm": 0.10258020778548946,
      "learning_rate": 4.338941248764781e-06,
      "loss": 0.4221,
      "step": 264
    },
    {
      "epoch": 4.344262295081967,
      "grad_norm": 0.10174308704204628,
      "learning_rate": 4.133546323387281e-06,
      "loss": 0.4291,
      "step": 265
    },
    {
      "epoch": 4.360655737704918,
      "grad_norm": 0.09609276817159343,
      "learning_rate": 3.932866404551092e-06,
      "loss": 0.4183,
      "step": 266
    },
    {
      "epoch": 4.377049180327869,
      "grad_norm": 0.11045174230896027,
      "learning_rate": 3.7369278736525584e-06,
      "loss": 0.4237,
      "step": 267
    },
    {
      "epoch": 4.39344262295082,
      "grad_norm": 0.09919053798520305,
      "learning_rate": 3.545756488784826e-06,
      "loss": 0.4247,
      "step": 268
    },
    {
      "epoch": 4.409836065573771,
      "grad_norm": 0.10170621462280902,
      "learning_rate": 3.359377381351707e-06,
      "loss": 0.4162,
      "step": 269
    },
    {
      "epoch": 4.426229508196721,
      "grad_norm": 0.09599940271102718,
      "learning_rate": 3.1778150527638664e-06,
      "loss": 0.425,
      "step": 270
    },
    {
      "epoch": 4.442622950819672,
      "grad_norm": 0.09230359083631827,
      "learning_rate": 3.0010933712178735e-06,
      "loss": 0.425,
      "step": 271
    },
    {
      "epoch": 4.459016393442623,
      "grad_norm": 0.09161534788091766,
      "learning_rate": 2.8292355685585147e-06,
      "loss": 0.431,
      "step": 272
    },
    {
      "epoch": 4.475409836065574,
      "grad_norm": 0.09181661135879722,
      "learning_rate": 2.6622642372246874e-06,
      "loss": 0.4202,
      "step": 273
    },
    {
      "epoch": 4.491803278688525,
      "grad_norm": 0.09201200891820971,
      "learning_rate": 2.5002013272794654e-06,
      "loss": 0.4199,
      "step": 274
    },
    {
      "epoch": 4.508196721311475,
      "grad_norm": 0.09965795343039312,
      "learning_rate": 2.3430681435244386e-06,
      "loss": 0.4268,
      "step": 275
    },
    {
      "epoch": 4.524590163934426,
      "grad_norm": 0.09118520040005619,
      "learning_rate": 2.1908853426990983e-06,
      "loss": 0.4251,
      "step": 276
    },
    {
      "epoch": 4.540983606557377,
      "grad_norm": 0.0928501159245484,
      "learning_rate": 2.0436729307652213e-06,
      "loss": 0.4213,
      "step": 277
    },
    {
      "epoch": 4.557377049180328,
      "grad_norm": 0.09183472059851619,
      "learning_rate": 1.9014502602769003e-06,
      "loss": 0.4245,
      "step": 278
    },
    {
      "epoch": 4.573770491803279,
      "grad_norm": 0.08782980102005061,
      "learning_rate": 1.7642360278364767e-06,
      "loss": 0.4233,
      "step": 279
    },
    {
      "epoch": 4.590163934426229,
      "grad_norm": 0.09375666869538167,
      "learning_rate": 1.6320482716366726e-06,
      "loss": 0.425,
      "step": 280
    },
    {
      "epoch": 4.60655737704918,
      "grad_norm": 0.08785996758165288,
      "learning_rate": 1.5049043690892729e-06,
      "loss": 0.4145,
      "step": 281
    },
    {
      "epoch": 4.622950819672131,
      "grad_norm": 0.09413477632311608,
      "learning_rate": 1.3828210345407268e-06,
      "loss": 0.4216,
      "step": 282
    },
    {
      "epoch": 4.639344262295082,
      "grad_norm": 0.09447484104691518,
      "learning_rate": 1.2658143170748383e-06,
      "loss": 0.4201,
      "step": 283
    },
    {
      "epoch": 4.655737704918033,
      "grad_norm": 0.09229151843804058,
      "learning_rate": 1.1538995984029654e-06,
      "loss": 0.4163,
      "step": 284
    },
    {
      "epoch": 4.672131147540983,
      "grad_norm": 0.08568358284035806,
      "learning_rate": 1.047091590841971e-06,
      "loss": 0.4263,
      "step": 285
    },
    {
      "epoch": 4.688524590163935,
      "grad_norm": 0.09023433045860603,
      "learning_rate": 9.454043353800801e-07,
      "loss": 0.4221,
      "step": 286
    },
    {
      "epoch": 4.704918032786885,
      "grad_norm": 0.0844734735476981,
      "learning_rate": 8.488511998311133e-07,
      "loss": 0.422,
      "step": 287
    },
    {
      "epoch": 4.721311475409836,
      "grad_norm": 0.08564054109423068,
      "learning_rate": 7.574448770771003e-07,
      "loss": 0.4226,
      "step": 288
    },
    {
      "epoch": 4.737704918032787,
      "grad_norm": 0.08653160576715448,
      "learning_rate": 6.711973833997088e-07,
      "loss": 0.4179,
      "step": 289
    },
    {
      "epoch": 4.754098360655737,
      "grad_norm": 0.08383004695421666,
      "learning_rate": 5.90120056900565e-07,
      "loss": 0.415,
      "step": 290
    },
    {
      "epoch": 4.770491803278689,
      "grad_norm": 0.08834395449932624,
      "learning_rate": 5.142235560107623e-07,
      "loss": 0.4263,
      "step": 291
    },
    {
      "epoch": 4.786885245901639,
      "grad_norm": 0.08320061631305875,
      "learning_rate": 4.4351785808967485e-07,
      "loss": 0.4272,
      "step": 292
    },
    {
      "epoch": 4.80327868852459,
      "grad_norm": 0.08304223152666458,
      "learning_rate": 3.780122581133672e-07,
      "loss": 0.4223,
      "step": 293
    },
    {
      "epoch": 4.8196721311475414,
      "grad_norm": 0.08927719387363141,
      "learning_rate": 3.177153674526556e-07,
      "loss": 0.4262,
      "step": 294
    },
    {
      "epoch": 4.836065573770492,
      "grad_norm": 0.08297799431829674,
      "learning_rate": 2.626351127410498e-07,
      "loss": 0.4223,
      "step": 295
    },
    {
      "epoch": 4.852459016393443,
      "grad_norm": 0.08318865729803561,
      "learning_rate": 2.1277873483274148e-07,
      "loss": 0.4259,
      "step": 296
    },
    {
      "epoch": 4.868852459016393,
      "grad_norm": 0.0828464753884451,
      "learning_rate": 1.6815278785069944e-07,
      "loss": 0.4235,
      "step": 297
    },
    {
      "epoch": 4.885245901639344,
      "grad_norm": 0.08134276640655788,
      "learning_rate": 1.2876313832508314e-07,
      "loss": 0.4193,
      "step": 298
    },
    {
      "epoch": 4.901639344262295,
      "grad_norm": 0.08124195226061579,
      "learning_rate": 9.461496442199735e-08,
      "loss": 0.417,
      "step": 299
    },
    {
      "epoch": 4.918032786885246,
      "grad_norm": 0.08525708385313023,
      "learning_rate": 6.571275526281451e-08,
      "loss": 0.4301,
      "step": 300
    },
    {
      "epoch": 4.934426229508197,
      "grad_norm": 0.08475437381287206,
      "learning_rate": 4.206031033399338e-08,
      "loss": 0.4309,
      "step": 301
    },
    {
      "epoch": 4.950819672131147,
      "grad_norm": 0.08111110985986,
      "learning_rate": 2.366073898761645e-08,
      "loss": 0.4143,
      "step": 302
    },
    {
      "epoch": 4.967213114754099,
      "grad_norm": 0.08973114402396704,
      "learning_rate": 1.0516460032641284e-08,
      "loss": 0.4301,
      "step": 303
    },
    {
      "epoch": 4.983606557377049,
      "grad_norm": 0.07995200669593748,
      "learning_rate": 2.6292014169104764e-09,
      "loss": 0.418,
      "step": 304
    },
    {
      "epoch": 5.0,
      "grad_norm": 0.0879869455035565,
      "learning_rate": 0.0,
      "loss": 0.4146,
      "step": 305
    },
    {
      "epoch": 5.0,
      "step": 305,
      "total_flos": 5116723388743680.0,
      "train_loss": 0.5099891699728418,
      "train_runtime": 4463.8352,
      "train_samples_per_second": 34.825,
      "train_steps_per_second": 0.068
    }
  ],
  "logging_steps": 1,
  "max_steps": 305,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 5,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 5116723388743680.0,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}