| { | |
| "best_global_step": 810, | |
| "best_metric": 0.11693229526281357, | |
| "best_model_checkpoint": "saves_stability/ia3/llama-3-8b-instruct/train_copa_1757340208/checkpoint-810", | |
| "epoch": 10.0, | |
| "eval_steps": 45, | |
| "global_step": 900, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.05555555555555555, | |
| "grad_norm": 3.600356340408325, | |
| "learning_rate": 2.2222222222222225e-06, | |
| "loss": 0.5535, | |
| "num_input_tokens_seen": 1536, | |
| "step": 5 | |
| }, | |
| { | |
| "epoch": 0.1111111111111111, | |
| "grad_norm": 3.9554924964904785, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6368, | |
| "num_input_tokens_seen": 3168, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.16666666666666666, | |
| "grad_norm": 4.407029628753662, | |
| "learning_rate": 7.777777777777777e-06, | |
| "loss": 0.6389, | |
| "num_input_tokens_seen": 4736, | |
| "step": 15 | |
| }, | |
| { | |
| "epoch": 0.2222222222222222, | |
| "grad_norm": 3.8984076976776123, | |
| "learning_rate": 1.0555555555555555e-05, | |
| "loss": 0.5581, | |
| "num_input_tokens_seen": 6304, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.2777777777777778, | |
| "grad_norm": 4.774829387664795, | |
| "learning_rate": 1.3333333333333333e-05, | |
| "loss": 0.5253, | |
| "num_input_tokens_seen": 7840, | |
| "step": 25 | |
| }, | |
| { | |
| "epoch": 0.3333333333333333, | |
| "grad_norm": 2.968090057373047, | |
| "learning_rate": 1.6111111111111115e-05, | |
| "loss": 0.4605, | |
| "num_input_tokens_seen": 9408, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.3888888888888889, | |
| "grad_norm": 4.563629627227783, | |
| "learning_rate": 1.888888888888889e-05, | |
| "loss": 0.6419, | |
| "num_input_tokens_seen": 10912, | |
| "step": 35 | |
| }, | |
| { | |
| "epoch": 0.4444444444444444, | |
| "grad_norm": 3.711933135986328, | |
| "learning_rate": 2.1666666666666667e-05, | |
| "loss": 0.7023, | |
| "num_input_tokens_seen": 12448, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.5, | |
| "grad_norm": 4.084500312805176, | |
| "learning_rate": 2.4444444444444445e-05, | |
| "loss": 0.6395, | |
| "num_input_tokens_seen": 14016, | |
| "step": 45 | |
| }, | |
| { | |
| "epoch": 0.5, | |
| "eval_loss": 0.6499536633491516, | |
| "eval_runtime": 0.5733, | |
| "eval_samples_per_second": 69.77, | |
| "eval_steps_per_second": 17.442, | |
| "num_input_tokens_seen": 14016, | |
| "step": 45 | |
| }, | |
| { | |
| "epoch": 0.5555555555555556, | |
| "grad_norm": 3.8269877433776855, | |
| "learning_rate": 2.7222222222222223e-05, | |
| "loss": 0.6308, | |
| "num_input_tokens_seen": 15584, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.6111111111111112, | |
| "grad_norm": 3.173386573791504, | |
| "learning_rate": 3e-05, | |
| "loss": 0.6346, | |
| "num_input_tokens_seen": 17184, | |
| "step": 55 | |
| }, | |
| { | |
| "epoch": 0.6666666666666666, | |
| "grad_norm": 4.542137622833252, | |
| "learning_rate": 3.277777777777778e-05, | |
| "loss": 0.6151, | |
| "num_input_tokens_seen": 18752, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.7222222222222222, | |
| "grad_norm": 4.822482109069824, | |
| "learning_rate": 3.555555555555556e-05, | |
| "loss": 0.8057, | |
| "num_input_tokens_seen": 20352, | |
| "step": 65 | |
| }, | |
| { | |
| "epoch": 0.7777777777777778, | |
| "grad_norm": 3.6719212532043457, | |
| "learning_rate": 3.8333333333333334e-05, | |
| "loss": 0.4894, | |
| "num_input_tokens_seen": 21952, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.8333333333333334, | |
| "grad_norm": 3.7617292404174805, | |
| "learning_rate": 4.111111111111111e-05, | |
| "loss": 0.4534, | |
| "num_input_tokens_seen": 23456, | |
| "step": 75 | |
| }, | |
| { | |
| "epoch": 0.8888888888888888, | |
| "grad_norm": 2.934063673019409, | |
| "learning_rate": 4.388888888888889e-05, | |
| "loss": 0.8631, | |
| "num_input_tokens_seen": 25056, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.9444444444444444, | |
| "grad_norm": 4.400132179260254, | |
| "learning_rate": 4.666666666666667e-05, | |
| "loss": 0.5899, | |
| "num_input_tokens_seen": 26560, | |
| "step": 85 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "grad_norm": 2.6979613304138184, | |
| "learning_rate": 4.9444444444444446e-05, | |
| "loss": 0.7006, | |
| "num_input_tokens_seen": 28096, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "eval_loss": 0.4485316872596741, | |
| "eval_runtime": 0.5736, | |
| "eval_samples_per_second": 69.73, | |
| "eval_steps_per_second": 17.433, | |
| "num_input_tokens_seen": 28096, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 1.0555555555555556, | |
| "grad_norm": 2.279639720916748, | |
| "learning_rate": 4.9996991493233693e-05, | |
| "loss": 0.4051, | |
| "num_input_tokens_seen": 29696, | |
| "step": 95 | |
| }, | |
| { | |
| "epoch": 1.1111111111111112, | |
| "grad_norm": 0.6512580513954163, | |
| "learning_rate": 4.99847706754774e-05, | |
| "loss": 0.2066, | |
| "num_input_tokens_seen": 31232, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 1.1666666666666667, | |
| "grad_norm": 4.290563583374023, | |
| "learning_rate": 4.9963154107272295e-05, | |
| "loss": 0.1482, | |
| "num_input_tokens_seen": 32768, | |
| "step": 105 | |
| }, | |
| { | |
| "epoch": 1.2222222222222223, | |
| "grad_norm": 0.21567487716674805, | |
| "learning_rate": 4.993214991772563e-05, | |
| "loss": 0.0882, | |
| "num_input_tokens_seen": 34304, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 1.2777777777777777, | |
| "grad_norm": 0.06894425302743912, | |
| "learning_rate": 4.989176976624511e-05, | |
| "loss": 0.0236, | |
| "num_input_tokens_seen": 35872, | |
| "step": 115 | |
| }, | |
| { | |
| "epoch": 1.3333333333333333, | |
| "grad_norm": 0.27700164914131165, | |
| "learning_rate": 4.9842028838154285e-05, | |
| "loss": 0.203, | |
| "num_input_tokens_seen": 37408, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 1.3888888888888888, | |
| "grad_norm": 1.5392497777938843, | |
| "learning_rate": 4.978294583898196e-05, | |
| "loss": 0.1256, | |
| "num_input_tokens_seen": 38976, | |
| "step": 125 | |
| }, | |
| { | |
| "epoch": 1.4444444444444444, | |
| "grad_norm": 1.876647710800171, | |
| "learning_rate": 4.971454298742779e-05, | |
| "loss": 0.1008, | |
| "num_input_tokens_seen": 40576, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 1.5, | |
| "grad_norm": 0.10182231664657593, | |
| "learning_rate": 4.963684600700679e-05, | |
| "loss": 0.1984, | |
| "num_input_tokens_seen": 42144, | |
| "step": 135 | |
| }, | |
| { | |
| "epoch": 1.5, | |
| "eval_loss": 0.1416526734828949, | |
| "eval_runtime": 0.5795, | |
| "eval_samples_per_second": 69.023, | |
| "eval_steps_per_second": 17.256, | |
| "num_input_tokens_seen": 42144, | |
| "step": 135 | |
| }, | |
| { | |
| "epoch": 1.5555555555555556, | |
| "grad_norm": 0.09817753732204437, | |
| "learning_rate": 4.9549884116375714e-05, | |
| "loss": 0.0695, | |
| "num_input_tokens_seen": 43680, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 1.6111111111111112, | |
| "grad_norm": 1.252616047859192, | |
| "learning_rate": 4.9453690018345144e-05, | |
| "loss": 0.2439, | |
| "num_input_tokens_seen": 45248, | |
| "step": 145 | |
| }, | |
| { | |
| "epoch": 1.6666666666666665, | |
| "grad_norm": 0.037983693182468414, | |
| "learning_rate": 4.934829988758131e-05, | |
| "loss": 0.0273, | |
| "num_input_tokens_seen": 46816, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 1.7222222222222223, | |
| "grad_norm": 1.8877675533294678, | |
| "learning_rate": 4.923375335700223e-05, | |
| "loss": 0.165, | |
| "num_input_tokens_seen": 48384, | |
| "step": 155 | |
| }, | |
| { | |
| "epoch": 1.7777777777777777, | |
| "grad_norm": 0.031594209372997284, | |
| "learning_rate": 4.9110093502873476e-05, | |
| "loss": 0.0621, | |
| "num_input_tokens_seen": 49952, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 1.8333333333333335, | |
| "grad_norm": 0.021081136539578438, | |
| "learning_rate": 4.897736682860885e-05, | |
| "loss": 0.0201, | |
| "num_input_tokens_seen": 51520, | |
| "step": 165 | |
| }, | |
| { | |
| "epoch": 1.8888888888888888, | |
| "grad_norm": 0.7376044988632202, | |
| "learning_rate": 4.883562324728241e-05, | |
| "loss": 0.132, | |
| "num_input_tokens_seen": 53024, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 1.9444444444444444, | |
| "grad_norm": 1.6047018766403198, | |
| "learning_rate": 4.868491606285823e-05, | |
| "loss": 0.0817, | |
| "num_input_tokens_seen": 54592, | |
| "step": 175 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "grad_norm": 0.10223466902971268, | |
| "learning_rate": 4.8525301950144894e-05, | |
| "loss": 0.1022, | |
| "num_input_tokens_seen": 56128, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "eval_loss": 0.1372470110654831, | |
| "eval_runtime": 0.5716, | |
| "eval_samples_per_second": 69.977, | |
| "eval_steps_per_second": 17.494, | |
| "num_input_tokens_seen": 56128, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 2.0555555555555554, | |
| "grad_norm": 0.01886976882815361, | |
| "learning_rate": 4.835684093348244e-05, | |
| "loss": 0.0384, | |
| "num_input_tokens_seen": 57696, | |
| "step": 185 | |
| }, | |
| { | |
| "epoch": 2.111111111111111, | |
| "grad_norm": 1.9822107553482056, | |
| "learning_rate": 4.817959636416969e-05, | |
| "loss": 0.2179, | |
| "num_input_tokens_seen": 59264, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 2.1666666666666665, | |
| "grad_norm": 0.015257872641086578, | |
| "learning_rate": 4.7993634896640394e-05, | |
| "loss": 0.0513, | |
| "num_input_tokens_seen": 60864, | |
| "step": 195 | |
| }, | |
| { | |
| "epoch": 2.2222222222222223, | |
| "grad_norm": 1.1210095882415771, | |
| "learning_rate": 4.779902646339722e-05, | |
| "loss": 0.1016, | |
| "num_input_tokens_seen": 62464, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 2.2777777777777777, | |
| "grad_norm": 1.548754096031189, | |
| "learning_rate": 4.759584424871302e-05, | |
| "loss": 0.2368, | |
| "num_input_tokens_seen": 64032, | |
| "step": 205 | |
| }, | |
| { | |
| "epoch": 2.3333333333333335, | |
| "grad_norm": 0.06785988807678223, | |
| "learning_rate": 4.7384164661109176e-05, | |
| "loss": 0.0743, | |
| "num_input_tokens_seen": 65568, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 2.388888888888889, | |
| "grad_norm": 0.05626998096704483, | |
| "learning_rate": 4.7164067304621536e-05, | |
| "loss": 0.0286, | |
| "num_input_tokens_seen": 67104, | |
| "step": 215 | |
| }, | |
| { | |
| "epoch": 2.4444444444444446, | |
| "grad_norm": 0.05990084633231163, | |
| "learning_rate": 4.693563494886455e-05, | |
| "loss": 0.1757, | |
| "num_input_tokens_seen": 68704, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 2.5, | |
| "grad_norm": 0.7318986654281616, | |
| "learning_rate": 4.669895349790502e-05, | |
| "loss": 0.0941, | |
| "num_input_tokens_seen": 70272, | |
| "step": 225 | |
| }, | |
| { | |
| "epoch": 2.5, | |
| "eval_loss": 0.128821462392807, | |
| "eval_runtime": 0.5783, | |
| "eval_samples_per_second": 69.172, | |
| "eval_steps_per_second": 17.293, | |
| "num_input_tokens_seen": 70272, | |
| "step": 225 | |
| }, | |
| { | |
| "epoch": 2.5555555555555554, | |
| "grad_norm": 0.4467451572418213, | |
| "learning_rate": 4.645411195795709e-05, | |
| "loss": 0.0419, | |
| "num_input_tokens_seen": 71808, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 2.611111111111111, | |
| "grad_norm": 0.32919180393218994, | |
| "learning_rate": 4.620120240391065e-05, | |
| "loss": 0.106, | |
| "num_input_tokens_seen": 73408, | |
| "step": 235 | |
| }, | |
| { | |
| "epoch": 2.6666666666666665, | |
| "grad_norm": 0.48591506481170654, | |
| "learning_rate": 4.5940319944705736e-05, | |
| "loss": 0.0502, | |
| "num_input_tokens_seen": 74912, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 2.7222222222222223, | |
| "grad_norm": 1.6752086877822876, | |
| "learning_rate": 4.567156268756594e-05, | |
| "loss": 0.1541, | |
| "num_input_tokens_seen": 76544, | |
| "step": 245 | |
| }, | |
| { | |
| "epoch": 2.7777777777777777, | |
| "grad_norm": 0.12120076268911362, | |
| "learning_rate": 4.539503170110431e-05, | |
| "loss": 0.0417, | |
| "num_input_tokens_seen": 78112, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 2.8333333333333335, | |
| "grad_norm": 0.023592006415128708, | |
| "learning_rate": 4.5110830977315556e-05, | |
| "loss": 0.0074, | |
| "num_input_tokens_seen": 79712, | |
| "step": 255 | |
| }, | |
| { | |
| "epoch": 2.888888888888889, | |
| "grad_norm": 0.41189005970954895, | |
| "learning_rate": 4.4819067392468944e-05, | |
| "loss": 0.0533, | |
| "num_input_tokens_seen": 81280, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 2.9444444444444446, | |
| "grad_norm": 0.04254873842000961, | |
| "learning_rate": 4.4519850666916484e-05, | |
| "loss": 0.0351, | |
| "num_input_tokens_seen": 82848, | |
| "step": 265 | |
| }, | |
| { | |
| "epoch": 3.0, | |
| "grad_norm": 0.04430355131626129, | |
| "learning_rate": 4.4213293323831585e-05, | |
| "loss": 0.0492, | |
| "num_input_tokens_seen": 84352, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 3.0, | |
| "eval_loss": 0.12684686481952667, | |
| "eval_runtime": 0.5786, | |
| "eval_samples_per_second": 69.128, | |
| "eval_steps_per_second": 17.282, | |
| "num_input_tokens_seen": 84352, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 3.0555555555555554, | |
| "grad_norm": 1.4837942123413086, | |
| "learning_rate": 4.38995106468937e-05, | |
| "loss": 0.038, | |
| "num_input_tokens_seen": 85920, | |
| "step": 275 | |
| }, | |
| { | |
| "epoch": 3.111111111111111, | |
| "grad_norm": 0.08254540711641312, | |
| "learning_rate": 4.357862063693486e-05, | |
| "loss": 0.0555, | |
| "num_input_tokens_seen": 87520, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 3.1666666666666665, | |
| "grad_norm": 0.09326854348182678, | |
| "learning_rate": 4.325074396756437e-05, | |
| "loss": 0.0062, | |
| "num_input_tokens_seen": 89088, | |
| "step": 285 | |
| }, | |
| { | |
| "epoch": 3.2222222222222223, | |
| "grad_norm": 0.025380326434969902, | |
| "learning_rate": 4.2916003939788403e-05, | |
| "loss": 0.0619, | |
| "num_input_tokens_seen": 90688, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 3.2777777777777777, | |
| "grad_norm": 0.7877873778343201, | |
| "learning_rate": 4.257452643564155e-05, | |
| "loss": 0.0658, | |
| "num_input_tokens_seen": 92160, | |
| "step": 295 | |
| }, | |
| { | |
| "epoch": 3.3333333333333335, | |
| "grad_norm": 1.8795515298843384, | |
| "learning_rate": 4.22264398708477e-05, | |
| "loss": 0.1432, | |
| "num_input_tokens_seen": 93760, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 3.388888888888889, | |
| "grad_norm": 0.4649149179458618, | |
| "learning_rate": 4.1871875146528195e-05, | |
| "loss": 0.1452, | |
| "num_input_tokens_seen": 95360, | |
| "step": 305 | |
| }, | |
| { | |
| "epoch": 3.4444444444444446, | |
| "grad_norm": 0.023714592680335045, | |
| "learning_rate": 4.1510965599975196e-05, | |
| "loss": 0.0706, | |
| "num_input_tokens_seen": 96928, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 3.5, | |
| "grad_norm": 1.68911874294281, | |
| "learning_rate": 4.114384695450906e-05, | |
| "loss": 0.1109, | |
| "num_input_tokens_seen": 98464, | |
| "step": 315 | |
| }, | |
| { | |
| "epoch": 3.5, | |
| "eval_loss": 0.12574629485607147, | |
| "eval_runtime": 0.5791, | |
| "eval_samples_per_second": 69.073, | |
| "eval_steps_per_second": 17.268, | |
| "num_input_tokens_seen": 98464, | |
| "step": 315 | |
| }, | |
| { | |
| "epoch": 3.5555555555555554, | |
| "grad_norm": 1.7038501501083374, | |
| "learning_rate": 4.077065726843828e-05, | |
| "loss": 0.0583, | |
| "num_input_tokens_seen": 100064, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 3.611111111111111, | |
| "grad_norm": 0.10626251250505447, | |
| "learning_rate": 4.039153688314145e-05, | |
| "loss": 0.0862, | |
| "num_input_tokens_seen": 101600, | |
| "step": 325 | |
| }, | |
| { | |
| "epoch": 3.6666666666666665, | |
| "grad_norm": 0.026887105777859688, | |
| "learning_rate": 4.000662837029062e-05, | |
| "loss": 0.142, | |
| "num_input_tokens_seen": 103200, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 3.7222222222222223, | |
| "grad_norm": 1.1281291246414185, | |
| "learning_rate": 3.961607647823583e-05, | |
| "loss": 0.1242, | |
| "num_input_tokens_seen": 104768, | |
| "step": 335 | |
| }, | |
| { | |
| "epoch": 3.7777777777777777, | |
| "grad_norm": 0.7571738958358765, | |
| "learning_rate": 3.9220028077571295e-05, | |
| "loss": 0.0205, | |
| "num_input_tokens_seen": 106304, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 3.8333333333333335, | |
| "grad_norm": 0.020017391070723534, | |
| "learning_rate": 3.881863210590332e-05, | |
| "loss": 0.0276, | |
| "num_input_tokens_seen": 107904, | |
| "step": 345 | |
| }, | |
| { | |
| "epoch": 3.888888888888889, | |
| "grad_norm": 1.8858087062835693, | |
| "learning_rate": 3.841203951184095e-05, | |
| "loss": 0.1725, | |
| "num_input_tokens_seen": 109408, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 3.9444444444444446, | |
| "grad_norm": 0.029001127928495407, | |
| "learning_rate": 3.8000403198230387e-05, | |
| "loss": 0.011, | |
| "num_input_tokens_seen": 111008, | |
| "step": 355 | |
| }, | |
| { | |
| "epoch": 4.0, | |
| "grad_norm": 0.015091209672391415, | |
| "learning_rate": 3.75838779646545e-05, | |
| "loss": 0.0217, | |
| "num_input_tokens_seen": 112576, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 4.0, | |
| "eval_loss": 0.1229981929063797, | |
| "eval_runtime": 0.5847, | |
| "eval_samples_per_second": 68.412, | |
| "eval_steps_per_second": 17.103, | |
| "num_input_tokens_seen": 112576, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 4.055555555555555, | |
| "grad_norm": 0.04395188391208649, | |
| "learning_rate": 3.7162620449219e-05, | |
| "loss": 0.0876, | |
| "num_input_tokens_seen": 114144, | |
| "step": 365 | |
| }, | |
| { | |
| "epoch": 4.111111111111111, | |
| "grad_norm": 1.225183367729187, | |
| "learning_rate": 3.673678906964727e-05, | |
| "loss": 0.0333, | |
| "num_input_tokens_seen": 115712, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 4.166666666666667, | |
| "grad_norm": 0.019293656572699547, | |
| "learning_rate": 3.630654396370594e-05, | |
| "loss": 0.084, | |
| "num_input_tokens_seen": 117216, | |
| "step": 375 | |
| }, | |
| { | |
| "epoch": 4.222222222222222, | |
| "grad_norm": 0.01881745271384716, | |
| "learning_rate": 3.5872046928983626e-05, | |
| "loss": 0.0192, | |
| "num_input_tokens_seen": 118816, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 4.277777777777778, | |
| "grad_norm": 0.612468957901001, | |
| "learning_rate": 3.543346136204545e-05, | |
| "loss": 0.043, | |
| "num_input_tokens_seen": 120352, | |
| "step": 385 | |
| }, | |
| { | |
| "epoch": 4.333333333333333, | |
| "grad_norm": 1.3931995630264282, | |
| "learning_rate": 3.499095219698631e-05, | |
| "loss": 0.1121, | |
| "num_input_tokens_seen": 121920, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 4.388888888888889, | |
| "grad_norm": 1.119995355606079, | |
| "learning_rate": 3.454468584340588e-05, | |
| "loss": 0.0442, | |
| "num_input_tokens_seen": 123456, | |
| "step": 395 | |
| }, | |
| { | |
| "epoch": 4.444444444444445, | |
| "grad_norm": 1.9448041915893555, | |
| "learning_rate": 3.409483012382879e-05, | |
| "loss": 0.1374, | |
| "num_input_tokens_seen": 125056, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 4.5, | |
| "grad_norm": 1.8364677429199219, | |
| "learning_rate": 3.364155421059342e-05, | |
| "loss": 0.1727, | |
| "num_input_tokens_seen": 126624, | |
| "step": 405 | |
| }, | |
| { | |
| "epoch": 4.5, | |
| "eval_loss": 0.12207037210464478, | |
| "eval_runtime": 0.5732, | |
| "eval_samples_per_second": 69.781, | |
| "eval_steps_per_second": 17.445, | |
| "num_input_tokens_seen": 126624, | |
| "step": 405 | |
| }, | |
| { | |
| "epoch": 4.555555555555555, | |
| "grad_norm": 0.2935144603252411, | |
| "learning_rate": 3.318502856223311e-05, | |
| "loss": 0.0387, | |
| "num_input_tokens_seen": 128224, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 4.611111111111111, | |
| "grad_norm": 0.8400506973266602, | |
| "learning_rate": 3.272542485937369e-05, | |
| "loss": 0.0662, | |
| "num_input_tokens_seen": 129728, | |
| "step": 415 | |
| }, | |
| { | |
| "epoch": 4.666666666666667, | |
| "grad_norm": 1.2359319925308228, | |
| "learning_rate": 3.2262915940171376e-05, | |
| "loss": 0.1019, | |
| "num_input_tokens_seen": 131328, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 4.722222222222222, | |
| "grad_norm": 1.439225196838379, | |
| "learning_rate": 3.1797675735315455e-05, | |
| "loss": 0.0536, | |
| "num_input_tokens_seen": 132896, | |
| "step": 425 | |
| }, | |
| { | |
| "epoch": 4.777777777777778, | |
| "grad_norm": 0.030594151467084885, | |
| "learning_rate": 3.132987920262005e-05, | |
| "loss": 0.024, | |
| "num_input_tokens_seen": 134496, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 4.833333333333333, | |
| "grad_norm": 0.012412482872605324, | |
| "learning_rate": 3.085970226122962e-05, | |
| "loss": 0.0168, | |
| "num_input_tokens_seen": 136064, | |
| "step": 435 | |
| }, | |
| { | |
| "epoch": 4.888888888888889, | |
| "grad_norm": 0.47157421708106995, | |
| "learning_rate": 3.0387321725463e-05, | |
| "loss": 0.1113, | |
| "num_input_tokens_seen": 137664, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 4.944444444444445, | |
| "grad_norm": 0.025079967454075813, | |
| "learning_rate": 2.9912915238320754e-05, | |
| "loss": 0.0876, | |
| "num_input_tokens_seen": 139232, | |
| "step": 445 | |
| }, | |
| { | |
| "epoch": 5.0, | |
| "grad_norm": 0.16793425381183624, | |
| "learning_rate": 2.9436661204680882e-05, | |
| "loss": 0.0202, | |
| "num_input_tokens_seen": 140832, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 5.0, | |
| "eval_loss": 0.1202269047498703, | |
| "eval_runtime": 0.5683, | |
| "eval_samples_per_second": 70.384, | |
| "eval_steps_per_second": 17.596, | |
| "num_input_tokens_seen": 140832, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 5.055555555555555, | |
| "grad_norm": 0.6793619394302368, | |
| "learning_rate": 2.8958738724208072e-05, | |
| "loss": 0.136, | |
| "num_input_tokens_seen": 142368, | |
| "step": 455 | |
| }, | |
| { | |
| "epoch": 5.111111111111111, | |
| "grad_norm": 0.25957050919532776, | |
| "learning_rate": 2.8479327524001636e-05, | |
| "loss": 0.0269, | |
| "num_input_tokens_seen": 144000, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 5.166666666666667, | |
| "grad_norm": 0.20538966357707977, | |
| "learning_rate": 2.7998607891007495e-05, | |
| "loss": 0.0361, | |
| "num_input_tokens_seen": 145632, | |
| "step": 465 | |
| }, | |
| { | |
| "epoch": 5.222222222222222, | |
| "grad_norm": 1.7816319465637207, | |
| "learning_rate": 2.7516760604219617e-05, | |
| "loss": 0.1392, | |
| "num_input_tokens_seen": 147168, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 5.277777777777778, | |
| "grad_norm": 1.9632638692855835, | |
| "learning_rate": 2.7033966866696457e-05, | |
| "loss": 0.1823, | |
| "num_input_tokens_seen": 148736, | |
| "step": 475 | |
| }, | |
| { | |
| "epoch": 5.333333333333333, | |
| "grad_norm": 0.016877813264727592, | |
| "learning_rate": 2.6550408237417885e-05, | |
| "loss": 0.0291, | |
| "num_input_tokens_seen": 150304, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 5.388888888888889, | |
| "grad_norm": 0.25234395265579224, | |
| "learning_rate": 2.6066266563008267e-05, | |
| "loss": 0.0992, | |
| "num_input_tokens_seen": 151872, | |
| "step": 485 | |
| }, | |
| { | |
| "epoch": 5.444444444444445, | |
| "grad_norm": 1.0347378253936768, | |
| "learning_rate": 2.5581723909351406e-05, | |
| "loss": 0.0487, | |
| "num_input_tokens_seen": 153472, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 5.5, | |
| "grad_norm": 0.35803094506263733, | |
| "learning_rate": 2.5096962493123012e-05, | |
| "loss": 0.052, | |
| "num_input_tokens_seen": 154976, | |
| "step": 495 | |
| }, | |
| { | |
| "epoch": 5.5, | |
| "eval_loss": 0.12070164829492569, | |
| "eval_runtime": 0.6285, | |
| "eval_samples_per_second": 63.648, | |
| "eval_steps_per_second": 15.912, | |
| "num_input_tokens_seen": 154976, | |
| "step": 495 | |
| }, | |
| { | |
| "epoch": 5.555555555555555, | |
| "grad_norm": 0.6016066074371338, | |
| "learning_rate": 2.461216461326642e-05, | |
| "loss": 0.0278, | |
| "num_input_tokens_seen": 156544, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 5.611111111111111, | |
| "grad_norm": 0.014409479685127735, | |
| "learning_rate": 2.4127512582437485e-05, | |
| "loss": 0.1038, | |
| "num_input_tokens_seen": 158112, | |
| "step": 505 | |
| }, | |
| { | |
| "epoch": 5.666666666666667, | |
| "grad_norm": 0.6132456660270691, | |
| "learning_rate": 2.364318865844416e-05, | |
| "loss": 0.0933, | |
| "num_input_tokens_seen": 159680, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 5.722222222222222, | |
| "grad_norm": 0.014162052422761917, | |
| "learning_rate": 2.3159374975706884e-05, | |
| "loss": 0.0035, | |
| "num_input_tokens_seen": 161312, | |
| "step": 515 | |
| }, | |
| { | |
| "epoch": 5.777777777777778, | |
| "grad_norm": 0.7488720417022705, | |
| "learning_rate": 2.2676253476765196e-05, | |
| "loss": 0.0197, | |
| "num_input_tokens_seen": 162880, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 5.833333333333333, | |
| "grad_norm": 0.01749660074710846, | |
| "learning_rate": 2.2194005843856636e-05, | |
| "loss": 0.0182, | |
| "num_input_tokens_seen": 164448, | |
| "step": 525 | |
| }, | |
| { | |
| "epoch": 5.888888888888889, | |
| "grad_norm": 0.8407071232795715, | |
| "learning_rate": 2.1712813430593436e-05, | |
| "loss": 0.055, | |
| "num_input_tokens_seen": 166016, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 5.944444444444445, | |
| "grad_norm": 0.9777589440345764, | |
| "learning_rate": 2.1232857193762924e-05, | |
| "loss": 0.0954, | |
| "num_input_tokens_seen": 167552, | |
| "step": 535 | |
| }, | |
| { | |
| "epoch": 6.0, | |
| "grad_norm": 0.44680315256118774, | |
| "learning_rate": 2.0754317625276983e-05, | |
| "loss": 0.0191, | |
| "num_input_tokens_seen": 169056, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 6.0, | |
| "eval_loss": 0.12337921559810638, | |
| "eval_runtime": 0.5722, | |
| "eval_samples_per_second": 69.903, | |
| "eval_steps_per_second": 17.476, | |
| "num_input_tokens_seen": 169056, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 6.055555555555555, | |
| "grad_norm": 0.01602465845644474, | |
| "learning_rate": 2.02773746842965e-05, | |
| "loss": 0.0619, | |
| "num_input_tokens_seen": 170592, | |
| "step": 545 | |
| }, | |
| { | |
| "epoch": 6.111111111111111, | |
| "grad_norm": 0.03292570635676384, | |
| "learning_rate": 1.980220772955602e-05, | |
| "loss": 0.0953, | |
| "num_input_tokens_seen": 172192, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 6.166666666666667, | |
| "grad_norm": 1.0759538412094116, | |
| "learning_rate": 1.932899545191433e-05, | |
| "loss": 0.1297, | |
| "num_input_tokens_seen": 173792, | |
| "step": 555 | |
| }, | |
| { | |
| "epoch": 6.222222222222222, | |
| "grad_norm": 0.1572539508342743, | |
| "learning_rate": 1.8857915807156092e-05, | |
| "loss": 0.0239, | |
| "num_input_tokens_seen": 175360, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 6.277777777777778, | |
| "grad_norm": 0.03557334840297699, | |
| "learning_rate": 1.838914594906995e-05, | |
| "loss": 0.0273, | |
| "num_input_tokens_seen": 176992, | |
| "step": 565 | |
| }, | |
| { | |
| "epoch": 6.333333333333333, | |
| "grad_norm": 0.04483241215348244, | |
| "learning_rate": 1.792286216282824e-05, | |
| "loss": 0.0271, | |
| "num_input_tokens_seen": 178592, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 6.388888888888889, | |
| "grad_norm": 0.019764013588428497, | |
| "learning_rate": 1.7459239798693364e-05, | |
| "loss": 0.0491, | |
| "num_input_tokens_seen": 180128, | |
| "step": 575 | |
| }, | |
| { | |
| "epoch": 6.444444444444445, | |
| "grad_norm": 1.391746997833252, | |
| "learning_rate": 1.699845320607571e-05, | |
| "loss": 0.0969, | |
| "num_input_tokens_seen": 181632, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 6.5, | |
| "grad_norm": 2.3588414192199707, | |
| "learning_rate": 1.6540675667967974e-05, | |
| "loss": 0.1793, | |
| "num_input_tokens_seen": 183200, | |
| "step": 585 | |
| }, | |
| { | |
| "epoch": 6.5, | |
| "eval_loss": 0.11848799884319305, | |
| "eval_runtime": 0.5778, | |
| "eval_samples_per_second": 69.23, | |
| "eval_steps_per_second": 17.308, | |
| "num_input_tokens_seen": 183200, | |
| "step": 585 | |
| }, | |
| { | |
| "epoch": 6.555555555555555, | |
| "grad_norm": 0.029464751482009888, | |
| "learning_rate": 1.60860793357805e-05, | |
| "loss": 0.0266, | |
| "num_input_tokens_seen": 184800, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 6.611111111111111, | |
| "grad_norm": 0.42808595299720764, | |
| "learning_rate": 1.56348351646022e-05, | |
| "loss": 0.0481, | |
| "num_input_tokens_seen": 186336, | |
| "step": 595 | |
| }, | |
| { | |
| "epoch": 6.666666666666667, | |
| "grad_norm": 0.4245648980140686, | |
| "learning_rate": 1.5187112848911323e-05, | |
| "loss": 0.0511, | |
| "num_input_tokens_seen": 187904, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 6.722222222222222, | |
| "grad_norm": 1.6949938535690308, | |
| "learning_rate": 1.47430807587603e-05, | |
| "loss": 0.0723, | |
| "num_input_tokens_seen": 189472, | |
| "step": 605 | |
| }, | |
| { | |
| "epoch": 6.777777777777778, | |
| "grad_norm": 0.24324463307857513, | |
| "learning_rate": 1.430290587645865e-05, | |
| "loss": 0.0887, | |
| "num_input_tokens_seen": 191072, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 6.833333333333333, | |
| "grad_norm": 0.14602553844451904, | |
| "learning_rate": 1.3866753733777765e-05, | |
| "loss": 0.0178, | |
| "num_input_tokens_seen": 192608, | |
| "step": 615 | |
| }, | |
| { | |
| "epoch": 6.888888888888889, | |
| "grad_norm": 0.033647798001766205, | |
| "learning_rate": 1.343478834970121e-05, | |
| "loss": 0.057, | |
| "num_input_tokens_seen": 194208, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 6.944444444444445, | |
| "grad_norm": 0.33840593695640564, | |
| "learning_rate": 1.3007172168743854e-05, | |
| "loss": 0.0123, | |
| "num_input_tokens_seen": 195776, | |
| "step": 625 | |
| }, | |
| { | |
| "epoch": 7.0, | |
| "grad_norm": 0.5205152630805969, | |
| "learning_rate": 1.2584065999863102e-05, | |
| "loss": 0.0722, | |
| "num_input_tokens_seen": 197344, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 7.0, | |
| "eval_loss": 0.11773133277893066, | |
| "eval_runtime": 0.5747, | |
| "eval_samples_per_second": 69.6, | |
| "eval_steps_per_second": 17.4, | |
| "num_input_tokens_seen": 197344, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 7.055555555555555, | |
| "grad_norm": 1.2121227979660034, | |
| "learning_rate": 1.2165628955985314e-05, | |
| "loss": 0.1661, | |
| "num_input_tokens_seen": 198944, | |
| "step": 635 | |
| }, | |
| { | |
| "epoch": 7.111111111111111, | |
| "grad_norm": 1.0526676177978516, | |
| "learning_rate": 1.175201839416988e-05, | |
| "loss": 0.1353, | |
| "num_input_tokens_seen": 200512, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 7.166666666666667, | |
| "grad_norm": 0.04684647545218468, | |
| "learning_rate": 1.1343389856433658e-05, | |
| "loss": 0.0109, | |
| "num_input_tokens_seen": 202016, | |
| "step": 645 | |
| }, | |
| { | |
| "epoch": 7.222222222222222, | |
| "grad_norm": 0.8230001330375671, | |
| "learning_rate": 1.0939897011258001e-05, | |
| "loss": 0.0614, | |
| "num_input_tokens_seen": 203648, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 7.277777777777778, | |
| "grad_norm": 0.3935108184814453, | |
| "learning_rate": 1.0541691595800337e-05, | |
| "loss": 0.174, | |
| "num_input_tokens_seen": 205184, | |
| "step": 655 | |
| }, | |
| { | |
| "epoch": 7.333333333333333, | |
| "grad_norm": 0.40703386068344116, | |
| "learning_rate": 1.0148923358832022e-05, | |
| "loss": 0.0445, | |
| "num_input_tokens_seen": 206720, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 7.388888888888889, | |
| "grad_norm": 0.33129680156707764, | |
| "learning_rate": 9.761740004423927e-06, | |
| "loss": 0.0139, | |
| "num_input_tokens_seen": 208320, | |
| "step": 665 | |
| }, | |
| { | |
| "epoch": 7.444444444444445, | |
| "grad_norm": 0.010633599944412708, | |
| "learning_rate": 9.380287136401e-06, | |
| "loss": 0.0068, | |
| "num_input_tokens_seen": 209856, | |
| "step": 670 | |
| }, | |
| { | |
| "epoch": 7.5, | |
| "grad_norm": 0.9165171980857849, | |
| "learning_rate": 9.00470820358663e-06, | |
| "loss": 0.0602, | |
| "num_input_tokens_seen": 211392, | |
| "step": 675 | |
| }, | |
| { | |
| "epoch": 7.5, | |
| "eval_loss": 0.11858992278575897, | |
| "eval_runtime": 0.5724, | |
| "eval_samples_per_second": 69.885, | |
| "eval_steps_per_second": 17.471, | |
| "num_input_tokens_seen": 211392, | |
| "step": 675 | |
| }, | |
| { | |
| "epoch": 7.555555555555555, | |
| "grad_norm": 0.12652786076068878, | |
| "learning_rate": 8.635144445857406e-06, | |
| "loss": 0.0055, | |
| "num_input_tokens_seen": 212960, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 7.611111111111111, | |
| "grad_norm": 0.17106826603412628, | |
| "learning_rate": 8.271734841028553e-06, | |
| "loss": 0.0238, | |
| "num_input_tokens_seen": 214528, | |
| "step": 685 | |
| }, | |
| { | |
| "epoch": 7.666666666666667, | |
| "grad_norm": 0.5190755724906921, | |
| "learning_rate": 7.914616052590071e-06, | |
| "loss": 0.0295, | |
| "num_input_tokens_seen": 216000, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 7.722222222222222, | |
| "grad_norm": 0.5266556739807129, | |
| "learning_rate": 7.563922378313218e-06, | |
| "loss": 0.1226, | |
| "num_input_tokens_seen": 217632, | |
| "step": 695 | |
| }, | |
| { | |
| "epoch": 7.777777777777778, | |
| "grad_norm": 0.021354423835873604, | |
| "learning_rate": 7.219785699746573e-06, | |
| "loss": 0.0781, | |
| "num_input_tokens_seen": 219232, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 7.833333333333333, | |
| "grad_norm": 0.04843864589929581, | |
| "learning_rate": 6.882335432620779e-06, | |
| "loss": 0.0923, | |
| "num_input_tokens_seen": 220800, | |
| "step": 705 | |
| }, | |
| { | |
| "epoch": 7.888888888888889, | |
| "grad_norm": 0.022320907562971115, | |
| "learning_rate": 6.55169847818059e-06, | |
| "loss": 0.0177, | |
| "num_input_tokens_seen": 222368, | |
| "step": 710 | |
| }, | |
| { | |
| "epoch": 7.944444444444445, | |
| "grad_norm": 0.9886584281921387, | |
| "learning_rate": 6.22799917546252e-06, | |
| "loss": 0.0276, | |
| "num_input_tokens_seen": 223968, | |
| "step": 715 | |
| }, | |
| { | |
| "epoch": 8.0, | |
| "grad_norm": 0.9058972001075745, | |
| "learning_rate": 5.9113592545359945e-06, | |
| "loss": 0.0334, | |
| "num_input_tokens_seen": 225536, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 8.0, | |
| "eval_loss": 0.12042172253131866, | |
| "eval_runtime": 0.584, | |
| "eval_samples_per_second": 68.49, | |
| "eval_steps_per_second": 17.122, | |
| "num_input_tokens_seen": 225536, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 8.055555555555555, | |
| "grad_norm": 0.4627024233341217, | |
| "learning_rate": 5.601897790725643e-06, | |
| "loss": 0.043, | |
| "num_input_tokens_seen": 227168, | |
| "step": 725 | |
| }, | |
| { | |
| "epoch": 8.11111111111111, | |
| "grad_norm": 0.03396330401301384, | |
| "learning_rate": 5.299731159831953e-06, | |
| "loss": 0.0093, | |
| "num_input_tokens_seen": 228704, | |
| "step": 730 | |
| }, | |
| { | |
| "epoch": 8.166666666666666, | |
| "grad_norm": 0.2798207998275757, | |
| "learning_rate": 5.004972994367102e-06, | |
| "loss": 0.012, | |
| "num_input_tokens_seen": 230336, | |
| "step": 735 | |
| }, | |
| { | |
| "epoch": 8.222222222222221, | |
| "grad_norm": 0.033361539244651794, | |
| "learning_rate": 4.7177341408224e-06, | |
| "loss": 0.1495, | |
| "num_input_tokens_seen": 231936, | |
| "step": 740 | |
| }, | |
| { | |
| "epoch": 8.277777777777779, | |
| "grad_norm": 0.08061393350362778, | |
| "learning_rate": 4.438122617983443e-06, | |
| "loss": 0.0147, | |
| "num_input_tokens_seen": 233472, | |
| "step": 745 | |
| }, | |
| { | |
| "epoch": 8.333333333333334, | |
| "grad_norm": 0.028773680329322815, | |
| "learning_rate": 4.166243576308712e-06, | |
| "loss": 0.0641, | |
| "num_input_tokens_seen": 235040, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 8.38888888888889, | |
| "grad_norm": 0.13998565077781677, | |
| "learning_rate": 3.9021992583867325e-06, | |
| "loss": 0.0362, | |
| "num_input_tokens_seen": 236608, | |
| "step": 755 | |
| }, | |
| { | |
| "epoch": 8.444444444444445, | |
| "grad_norm": 0.12831299006938934, | |
| "learning_rate": 3.6460889604868626e-06, | |
| "loss": 0.1365, | |
| "num_input_tokens_seen": 238144, | |
| "step": 760 | |
| }, | |
| { | |
| "epoch": 8.5, | |
| "grad_norm": 0.9268062114715576, | |
| "learning_rate": 3.398008995217988e-06, | |
| "loss": 0.0266, | |
| "num_input_tokens_seen": 239680, | |
| "step": 765 | |
| }, | |
| { | |
| "epoch": 8.5, | |
| "eval_loss": 0.11734791100025177, | |
| "eval_runtime": 0.5869, | |
| "eval_samples_per_second": 68.159, | |
| "eval_steps_per_second": 17.04, | |
| "num_input_tokens_seen": 239680, | |
| "step": 765 | |
| }, | |
| { | |
| "epoch": 8.555555555555555, | |
| "grad_norm": 0.01520704198628664, | |
| "learning_rate": 3.158052655309332e-06, | |
| "loss": 0.0078, | |
| "num_input_tokens_seen": 241280, | |
| "step": 770 | |
| }, | |
| { | |
| "epoch": 8.61111111111111, | |
| "grad_norm": 0.2806644141674042, | |
| "learning_rate": 2.9263101785268254e-06, | |
| "loss": 0.1713, | |
| "num_input_tokens_seen": 242816, | |
| "step": 775 | |
| }, | |
| { | |
| "epoch": 8.666666666666666, | |
| "grad_norm": 0.5216343998908997, | |
| "learning_rate": 2.7028687137384267e-06, | |
| "loss": 0.081, | |
| "num_input_tokens_seen": 244352, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 8.722222222222221, | |
| "grad_norm": 0.030346965417265892, | |
| "learning_rate": 2.487812288140945e-06, | |
| "loss": 0.0446, | |
| "num_input_tokens_seen": 245856, | |
| "step": 785 | |
| }, | |
| { | |
| "epoch": 8.777777777777779, | |
| "grad_norm": 1.0819330215454102, | |
| "learning_rate": 2.281221775660894e-06, | |
| "loss": 0.0602, | |
| "num_input_tokens_seen": 247456, | |
| "step": 790 | |
| }, | |
| { | |
| "epoch": 8.833333333333334, | |
| "grad_norm": 0.17765626311302185, | |
| "learning_rate": 2.0831748665410765e-06, | |
| "loss": 0.0498, | |
| "num_input_tokens_seen": 248992, | |
| "step": 795 | |
| }, | |
| { | |
| "epoch": 8.88888888888889, | |
| "grad_norm": 0.08204393833875656, | |
| "learning_rate": 1.893746038124497e-06, | |
| "loss": 0.1026, | |
| "num_input_tokens_seen": 250528, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 8.944444444444445, | |
| "grad_norm": 0.5647609829902649, | |
| "learning_rate": 1.713006526846439e-06, | |
| "loss": 0.0329, | |
| "num_input_tokens_seen": 252128, | |
| "step": 805 | |
| }, | |
| { | |
| "epoch": 9.0, | |
| "grad_norm": 1.5207810401916504, | |
| "learning_rate": 1.541024301445404e-06, | |
| "loss": 0.046, | |
| "num_input_tokens_seen": 253696, | |
| "step": 810 | |
| }, | |
| { | |
| "epoch": 9.0, | |
| "eval_loss": 0.11693229526281357, | |
| "eval_runtime": 0.5884, | |
| "eval_samples_per_second": 67.977, | |
| "eval_steps_per_second": 16.994, | |
| "num_input_tokens_seen": 253696, | |
| "step": 810 | |
| }, | |
| { | |
| "epoch": 9.055555555555555, | |
| "grad_norm": 0.4018714129924774, | |
| "learning_rate": 1.3778640374027985e-06, | |
| "loss": 0.078, | |
| "num_input_tokens_seen": 255296, | |
| "step": 815 | |
| }, | |
| { | |
| "epoch": 9.11111111111111, | |
| "grad_norm": 0.024602364748716354, | |
| "learning_rate": 1.2235870926211619e-06, | |
| "loss": 0.0619, | |
| "num_input_tokens_seen": 256896, | |
| "step": 820 | |
| }, | |
| { | |
| "epoch": 9.166666666666666, | |
| "grad_norm": 1.3829312324523926, | |
| "learning_rate": 1.0782514843499653e-06, | |
| "loss": 0.0472, | |
| "num_input_tokens_seen": 258432, | |
| "step": 825 | |
| }, | |
| { | |
| "epoch": 9.222222222222221, | |
| "grad_norm": 0.13033293187618256, | |
| "learning_rate": 9.419118673676924e-07, | |
| "loss": 0.02, | |
| "num_input_tokens_seen": 260000, | |
| "step": 830 | |
| }, | |
| { | |
| "epoch": 9.277777777777779, | |
| "grad_norm": 0.18839669227600098, | |
| "learning_rate": 8.146195134284052e-07, | |
| "loss": 0.1806, | |
| "num_input_tokens_seen": 261568, | |
| "step": 835 | |
| }, | |
| { | |
| "epoch": 9.333333333333334, | |
| "grad_norm": 0.35827043652534485, | |
| "learning_rate": 6.964222919805391e-07, | |
| "loss": 0.01, | |
| "num_input_tokens_seen": 263200, | |
| "step": 840 | |
| }, | |
| { | |
| "epoch": 9.38888888888889, | |
| "grad_norm": 1.3823553323745728, | |
| "learning_rate": 5.87364652165176e-07, | |
| "loss": 0.0487, | |
| "num_input_tokens_seen": 264736, | |
| "step": 845 | |
| }, | |
| { | |
| "epoch": 9.444444444444445, | |
| "grad_norm": 0.3393106460571289, | |
| "learning_rate": 4.874876061005173e-07, | |
| "loss": 0.0378, | |
| "num_input_tokens_seen": 266304, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 9.5, | |
| "grad_norm": 0.062399331480264664, | |
| "learning_rate": 3.9682871345891883e-07, | |
| "loss": 0.0042, | |
| "num_input_tokens_seen": 267840, | |
| "step": 855 | |
| }, | |
| { | |
| "epoch": 9.5, | |
| "eval_loss": 0.11859162151813507, | |
| "eval_runtime": 0.5737, | |
| "eval_samples_per_second": 69.721, | |
| "eval_steps_per_second": 17.43, | |
| "num_input_tokens_seen": 267840, | |
| "step": 855 | |
| }, | |
| { | |
| "epoch": 9.555555555555555, | |
| "grad_norm": 0.039976976811885834, | |
| "learning_rate": 3.1542206734221924e-07, | |
| "loss": 0.0237, | |
| "num_input_tokens_seen": 269376, | |
| "step": 860 | |
| }, | |
| { | |
| "epoch": 9.61111111111111, | |
| "grad_norm": 1.2259042263031006, | |
| "learning_rate": 2.4329828146074095e-07, | |
| "loss": 0.1186, | |
| "num_input_tokens_seen": 270944, | |
| "step": 865 | |
| }, | |
| { | |
| "epoch": 9.666666666666666, | |
| "grad_norm": 0.3249455690383911, | |
| "learning_rate": 1.8048447862070718e-07, | |
| "loss": 0.1139, | |
| "num_input_tokens_seen": 272448, | |
| "step": 870 | |
| }, | |
| { | |
| "epoch": 9.722222222222221, | |
| "grad_norm": 1.201401948928833, | |
| "learning_rate": 1.2700428052447033e-07, | |
| "loss": 0.1073, | |
| "num_input_tokens_seen": 273984, | |
| "step": 875 | |
| }, | |
| { | |
| "epoch": 9.777777777777779, | |
| "grad_norm": 0.745436429977417, | |
| "learning_rate": 8.28777988873486e-08, | |
| "loss": 0.0366, | |
| "num_input_tokens_seen": 275520, | |
| "step": 880 | |
| }, | |
| { | |
| "epoch": 9.833333333333334, | |
| "grad_norm": 0.009507009759545326, | |
| "learning_rate": 4.8121627874450625e-08, | |
| "loss": 0.0188, | |
| "num_input_tokens_seen": 277152, | |
| "step": 885 | |
| }, | |
| { | |
| "epoch": 9.88888888888889, | |
| "grad_norm": 0.710180938243866, | |
| "learning_rate": 2.2748837860270267e-08, | |
| "loss": 0.0931, | |
| "num_input_tokens_seen": 278688, | |
| "step": 890 | |
| }, | |
| { | |
| "epoch": 9.944444444444445, | |
| "grad_norm": 0.013682587072253227, | |
| "learning_rate": 6.768970513457151e-09, | |
| "loss": 0.0155, | |
| "num_input_tokens_seen": 280256, | |
| "step": 895 | |
| }, | |
| { | |
| "epoch": 10.0, | |
| "grad_norm": 0.21826046705245972, | |
| "learning_rate": 1.8803520859811406e-10, | |
| "loss": 0.0783, | |
| "num_input_tokens_seen": 281856, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 10.0, | |
| "eval_loss": 0.1192222386598587, | |
| "eval_runtime": 0.5725, | |
| "eval_samples_per_second": 69.864, | |
| "eval_steps_per_second": 17.466, | |
| "num_input_tokens_seen": 281856, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 10.0, | |
| "num_input_tokens_seen": 281856, | |
| "step": 900, | |
| "total_flos": 1.269218078097408e+16, | |
| "train_loss": 0.1295707409332196, | |
| "train_runtime": 167.4501, | |
| "train_samples_per_second": 21.499, | |
| "train_steps_per_second": 5.375 | |
| } | |
| ], | |
| "logging_steps": 5, | |
| "max_steps": 900, | |
| "num_input_tokens_seen": 281856, | |
| "num_train_epochs": 10, | |
| "save_steps": 45, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 1.269218078097408e+16, | |
| "train_batch_size": 4, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |