{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.0,
  "eval_steps": 200,
  "global_step": 950,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.002105263157894737,
      "grad_norm": 5.903966976524179,
      "learning_rate": 9.999972660400536e-06,
      "loss": 0.5085,
      "step": 1
    },
    {
      "epoch": 0.004210526315789474,
      "grad_norm": 4.938036117868723,
      "learning_rate": 9.999890641901124e-06,
      "loss": 0.4436,
      "step": 2
    },
    {
      "epoch": 0.00631578947368421,
      "grad_norm": 4.388144856014597,
      "learning_rate": 9.999753945398704e-06,
      "loss": 0.3735,
      "step": 3
    },
    {
      "epoch": 0.008421052631578947,
      "grad_norm": 3.677991014602486,
      "learning_rate": 9.99956257238817e-06,
      "loss": 0.3626,
      "step": 4
    },
    {
      "epoch": 0.010526315789473684,
      "grad_norm": 4.1261678577077365,
      "learning_rate": 9.999316524962347e-06,
      "loss": 0.3439,
      "step": 5
    },
    {
      "epoch": 0.01263157894736842,
      "grad_norm": 4.480913214762388,
      "learning_rate": 9.999015805811965e-06,
      "loss": 0.3425,
      "step": 6
    },
    {
      "epoch": 0.014736842105263158,
      "grad_norm": 3.6087816386136216,
      "learning_rate": 9.998660418225645e-06,
      "loss": 0.3288,
      "step": 7
    },
    {
      "epoch": 0.016842105263157894,
      "grad_norm": 3.8303469041456903,
      "learning_rate": 9.998250366089848e-06,
      "loss": 0.3631,
      "step": 8
    },
    {
      "epoch": 0.018947368421052633,
      "grad_norm": 4.051441024522369,
      "learning_rate": 9.997785653888835e-06,
      "loss": 0.3475,
      "step": 9
    },
    {
      "epoch": 0.021052631578947368,
      "grad_norm": 3.2811851205186113,
      "learning_rate": 9.99726628670463e-06,
      "loss": 0.2727,
      "step": 10
    },
    {
      "epoch": 0.023157894736842106,
      "grad_norm": 3.076267871254497,
      "learning_rate": 9.996692270216946e-06,
      "loss": 0.2735,
      "step": 11
    },
    {
      "epoch": 0.02526315789473684,
      "grad_norm": 4.218304225935037,
      "learning_rate": 9.996063610703138e-06,
      "loss": 0.3884,
      "step": 12
    },
    {
      "epoch": 0.02736842105263158,
      "grad_norm": 3.0056525766185564,
      "learning_rate": 9.995380315038119e-06,
      "loss": 0.2533,
      "step": 13
    },
    {
      "epoch": 0.029473684210526315,
      "grad_norm": 3.4173235428174,
      "learning_rate": 9.994642390694308e-06,
      "loss": 0.2708,
      "step": 14
    },
    {
      "epoch": 0.031578947368421054,
      "grad_norm": 3.804816278379765,
      "learning_rate": 9.993849845741525e-06,
      "loss": 0.3844,
      "step": 15
    },
    {
      "epoch": 0.03368421052631579,
      "grad_norm": 3.2085820806655585,
      "learning_rate": 9.993002688846913e-06,
      "loss": 0.34,
      "step": 16
    },
    {
      "epoch": 0.035789473684210524,
      "grad_norm": 3.343915010488622,
      "learning_rate": 9.992100929274848e-06,
      "loss": 0.3061,
      "step": 17
    },
    {
      "epoch": 0.037894736842105266,
      "grad_norm": 2.953981749061276,
      "learning_rate": 9.991144576886824e-06,
      "loss": 0.3124,
      "step": 18
    },
    {
      "epoch": 0.04,
      "grad_norm": 3.065585554970114,
      "learning_rate": 9.990133642141359e-06,
      "loss": 0.2862,
      "step": 19
    },
    {
      "epoch": 0.042105263157894736,
      "grad_norm": 3.0851420366070537,
      "learning_rate": 9.989068136093873e-06,
      "loss": 0.2916,
      "step": 20
    },
    {
      "epoch": 0.04421052631578947,
      "grad_norm": 3.201737423886665,
      "learning_rate": 9.987948070396572e-06,
      "loss": 0.288,
      "step": 21
    },
    {
      "epoch": 0.04631578947368421,
      "grad_norm": 2.859254132653785,
      "learning_rate": 9.986773457298311e-06,
      "loss": 0.2687,
      "step": 22
    },
    {
      "epoch": 0.04842105263157895,
      "grad_norm": 3.7432890508375283,
      "learning_rate": 9.985544309644474e-06,
      "loss": 0.3261,
      "step": 23
    },
    {
      "epoch": 0.05052631578947368,
      "grad_norm": 2.692301660805473,
      "learning_rate": 9.984260640876821e-06,
      "loss": 0.2282,
      "step": 24
    },
    {
      "epoch": 0.05263157894736842,
      "grad_norm": 4.281628785614743,
      "learning_rate": 9.98292246503335e-06,
      "loss": 0.4261,
      "step": 25
    },
    {
      "epoch": 0.05473684210526316,
      "grad_norm": 3.8562642975899055,
      "learning_rate": 9.981529796748135e-06,
      "loss": 0.3469,
      "step": 26
    },
    {
      "epoch": 0.056842105263157895,
      "grad_norm": 3.4474081351012,
      "learning_rate": 9.980082651251175e-06,
      "loss": 0.2656,
      "step": 27
    },
    {
      "epoch": 0.05894736842105263,
      "grad_norm": 3.3764979188026447,
      "learning_rate": 9.97858104436822e-06,
      "loss": 0.3002,
      "step": 28
    },
    {
      "epoch": 0.061052631578947365,
      "grad_norm": 4.197028644917946,
      "learning_rate": 9.977024992520604e-06,
      "loss": 0.3497,
      "step": 29
    },
    {
      "epoch": 0.06315789473684211,
      "grad_norm": 3.721020976898217,
      "learning_rate": 9.975414512725058e-06,
      "loss": 0.3571,
      "step": 30
    },
    {
      "epoch": 0.06526315789473684,
      "grad_norm": 3.555769583954405,
      "learning_rate": 9.973749622593534e-06,
      "loss": 0.2659,
      "step": 31
    },
    {
      "epoch": 0.06736842105263158,
      "grad_norm": 3.4068044809170934,
      "learning_rate": 9.972030340333e-06,
      "loss": 0.2353,
      "step": 32
    },
    {
      "epoch": 0.06947368421052631,
      "grad_norm": 3.602249926130539,
      "learning_rate": 9.970256684745258e-06,
      "loss": 0.2838,
      "step": 33
    },
    {
      "epoch": 0.07157894736842105,
      "grad_norm": 3.1569700401997474,
      "learning_rate": 9.968428675226714e-06,
      "loss": 0.2556,
      "step": 34
    },
    {
      "epoch": 0.07368421052631578,
      "grad_norm": 3.2728677083109523,
      "learning_rate": 9.966546331768192e-06,
      "loss": 0.2962,
      "step": 35
    },
    {
      "epoch": 0.07578947368421053,
      "grad_norm": 3.067636390716889,
      "learning_rate": 9.964609674954696e-06,
      "loss": 0.2882,
      "step": 36
    },
    {
      "epoch": 0.07789473684210527,
      "grad_norm": 2.8096887544728935,
      "learning_rate": 9.962618725965196e-06,
      "loss": 0.233,
      "step": 37
    },
    {
      "epoch": 0.08,
      "grad_norm": 3.3000170129014386,
      "learning_rate": 9.960573506572391e-06,
      "loss": 0.2534,
      "step": 38
    },
    {
      "epoch": 0.08210526315789474,
      "grad_norm": 3.5389529592924576,
      "learning_rate": 9.95847403914247e-06,
      "loss": 0.2772,
      "step": 39
    },
    {
      "epoch": 0.08421052631578947,
      "grad_norm": 3.453344441208766,
      "learning_rate": 9.956320346634877e-06,
      "loss": 0.272,
      "step": 40
    },
    {
      "epoch": 0.0863157894736842,
      "grad_norm": 4.307183932380211,
      "learning_rate": 9.954112452602045e-06,
      "loss": 0.3301,
      "step": 41
    },
    {
      "epoch": 0.08842105263157894,
      "grad_norm": 2.9656779667391193,
      "learning_rate": 9.951850381189152e-06,
      "loss": 0.2625,
      "step": 42
    },
    {
      "epoch": 0.09052631578947369,
      "grad_norm": 3.5490499282339343,
      "learning_rate": 9.949534157133844e-06,
      "loss": 0.2999,
      "step": 43
    },
    {
      "epoch": 0.09263157894736843,
      "grad_norm": 3.113609674175823,
      "learning_rate": 9.94716380576598e-06,
      "loss": 0.2757,
      "step": 44
    },
    {
      "epoch": 0.09473684210526316,
      "grad_norm": 2.9818002459346573,
      "learning_rate": 9.944739353007344e-06,
      "loss": 0.2816,
      "step": 45
    },
    {
      "epoch": 0.0968421052631579,
      "grad_norm": 3.434946218127203,
      "learning_rate": 9.942260825371359e-06,
      "loss": 0.212,
      "step": 46
    },
    {
      "epoch": 0.09894736842105263,
      "grad_norm": 4.459352367699622,
      "learning_rate": 9.939728249962808e-06,
      "loss": 0.4215,
      "step": 47
    },
    {
      "epoch": 0.10105263157894737,
      "grad_norm": 3.612646296219494,
      "learning_rate": 9.937141654477529e-06,
      "loss": 0.3106,
      "step": 48
    },
    {
      "epoch": 0.1031578947368421,
      "grad_norm": 3.243176699962646,
      "learning_rate": 9.934501067202117e-06,
      "loss": 0.2759,
      "step": 49
    },
    {
      "epoch": 0.10526315789473684,
      "grad_norm": 3.5224479463075893,
      "learning_rate": 9.931806517013612e-06,
      "loss": 0.3075,
      "step": 50
    },
    {
      "epoch": 0.10736842105263159,
      "grad_norm": 3.092501849170216,
      "learning_rate": 9.929058033379181e-06,
      "loss": 0.3048,
      "step": 51
    },
    {
      "epoch": 0.10947368421052632,
      "grad_norm": 4.090711703984495,
      "learning_rate": 9.926255646355804e-06,
      "loss": 0.2582,
      "step": 52
    },
    {
      "epoch": 0.11157894736842106,
      "grad_norm": 3.5682754090683435,
      "learning_rate": 9.923399386589933e-06,
      "loss": 0.3061,
      "step": 53
    },
    {
      "epoch": 0.11368421052631579,
      "grad_norm": 3.348524601466594,
      "learning_rate": 9.920489285317169e-06,
      "loss": 0.2372,
      "step": 54
    },
    {
      "epoch": 0.11578947368421053,
      "grad_norm": 3.1821874258149823,
      "learning_rate": 9.917525374361913e-06,
      "loss": 0.2545,
      "step": 55
    },
    {
      "epoch": 0.11789473684210526,
      "grad_norm": 3.539674414581924,
      "learning_rate": 9.91450768613702e-06,
      "loss": 0.3059,
      "step": 56
    },
    {
      "epoch": 0.12,
      "grad_norm": 3.4913151862965317,
      "learning_rate": 9.911436253643445e-06,
      "loss": 0.3316,
      "step": 57
    },
    {
      "epoch": 0.12210526315789473,
      "grad_norm": 4.020069826990793,
      "learning_rate": 9.908311110469881e-06,
      "loss": 0.3612,
      "step": 58
    },
    {
      "epoch": 0.12421052631578948,
      "grad_norm": 2.8568567871983936,
      "learning_rate": 9.905132290792395e-06,
      "loss": 0.2476,
      "step": 59
    },
    {
      "epoch": 0.12631578947368421,
      "grad_norm": 3.261450325703877,
      "learning_rate": 9.901899829374048e-06,
      "loss": 0.2954,
      "step": 60
    },
    {
      "epoch": 0.12842105263157894,
      "grad_norm": 2.7362895761373807,
      "learning_rate": 9.89861376156452e-06,
      "loss": 0.2721,
      "step": 61
    },
    {
      "epoch": 0.13052631578947368,
      "grad_norm": 3.11244127366191,
      "learning_rate": 9.895274123299724e-06,
      "loss": 0.298,
      "step": 62
    },
    {
      "epoch": 0.13263157894736843,
      "grad_norm": 3.6632993774201568,
      "learning_rate": 9.891880951101407e-06,
      "loss": 0.2902,
      "step": 63
    },
    {
      "epoch": 0.13473684210526315,
      "grad_norm": 3.320679243796475,
      "learning_rate": 9.888434282076759e-06,
      "loss": 0.254,
      "step": 64
    },
    {
      "epoch": 0.1368421052631579,
      "grad_norm": 3.532964946280944,
      "learning_rate": 9.884934153917998e-06,
      "loss": 0.3431,
      "step": 65
    },
    {
      "epoch": 0.13894736842105262,
      "grad_norm": 3.64231126319289,
      "learning_rate": 9.881380604901964e-06,
      "loss": 0.3407,
      "step": 66
    },
    {
      "epoch": 0.14105263157894737,
      "grad_norm": 3.1390516154053008,
      "learning_rate": 9.877773673889702e-06,
      "loss": 0.2751,
      "step": 67
    },
    {
      "epoch": 0.1431578947368421,
      "grad_norm": 3.4517465264493046,
      "learning_rate": 9.874113400326031e-06,
      "loss": 0.2667,
      "step": 68
    },
    {
      "epoch": 0.14526315789473684,
      "grad_norm": 3.2206234405147036,
      "learning_rate": 9.870399824239116e-06,
      "loss": 0.283,
      "step": 69
    },
    {
      "epoch": 0.14736842105263157,
      "grad_norm": 2.9995284377277542,
      "learning_rate": 9.86663298624003e-06,
      "loss": 0.2184,
      "step": 70
    },
    {
      "epoch": 0.14947368421052631,
      "grad_norm": 2.999026879401691,
      "learning_rate": 9.86281292752231e-06,
      "loss": 0.2429,
      "step": 71
    },
    {
      "epoch": 0.15157894736842106,
      "grad_norm": 2.766840628283391,
      "learning_rate": 9.858939689861506e-06,
      "loss": 0.2379,
      "step": 72
    },
    {
      "epoch": 0.15368421052631578,
      "grad_norm": 3.8140293080779113,
      "learning_rate": 9.855013315614725e-06,
      "loss": 0.2823,
      "step": 73
    },
    {
      "epoch": 0.15578947368421053,
      "grad_norm": 3.3982807201281147,
      "learning_rate": 9.851033847720167e-06,
      "loss": 0.2992,
      "step": 74
    },
    {
      "epoch": 0.15789473684210525,
      "grad_norm": 2.931894314971863,
      "learning_rate": 9.847001329696653e-06,
      "loss": 0.1989,
      "step": 75
    },
    {
      "epoch": 0.16,
      "grad_norm": 3.5469960575121307,
      "learning_rate": 9.842915805643156e-06,
      "loss": 0.3598,
      "step": 76
    },
    {
      "epoch": 0.16210526315789472,
      "grad_norm": 3.213106605680639,
      "learning_rate": 9.838777320238312e-06,
      "loss": 0.3194,
      "step": 77
    },
    {
      "epoch": 0.16421052631578947,
      "grad_norm": 4.433153434861342,
      "learning_rate": 9.834585918739936e-06,
      "loss": 0.3447,
      "step": 78
    },
    {
      "epoch": 0.16631578947368422,
      "grad_norm": 3.0884694722442303,
      "learning_rate": 9.830341646984521e-06,
      "loss": 0.26,
      "step": 79
    },
    {
      "epoch": 0.16842105263157894,
      "grad_norm": 3.3484965798832493,
      "learning_rate": 9.826044551386743e-06,
      "loss": 0.2775,
      "step": 80
    },
    {
      "epoch": 0.1705263157894737,
      "grad_norm": 3.4971689879223833,
      "learning_rate": 9.821694678938954e-06,
      "loss": 0.3016,
      "step": 81
    },
    {
      "epoch": 0.1726315789473684,
      "grad_norm": 3.282170006280911,
      "learning_rate": 9.817292077210658e-06,
      "loss": 0.3392,
      "step": 82
    },
    {
      "epoch": 0.17473684210526316,
      "grad_norm": 3.839326039283396,
      "learning_rate": 9.812836794348005e-06,
      "loss": 0.3436,
      "step": 83
    },
    {
      "epoch": 0.17684210526315788,
      "grad_norm": 3.942224451339831,
      "learning_rate": 9.808328879073251e-06,
      "loss": 0.3596,
      "step": 84
    },
    {
      "epoch": 0.17894736842105263,
      "grad_norm": 3.3687590727357355,
      "learning_rate": 9.803768380684242e-06,
      "loss": 0.3408,
      "step": 85
    },
    {
      "epoch": 0.18105263157894738,
      "grad_norm": 2.8534809428590497,
      "learning_rate": 9.79915534905385e-06,
      "loss": 0.2532,
      "step": 86
    },
    {
      "epoch": 0.1831578947368421,
      "grad_norm": 3.020705734497323,
      "learning_rate": 9.794489834629457e-06,
      "loss": 0.2882,
      "step": 87
    },
    {
      "epoch": 0.18526315789473685,
      "grad_norm": 2.7674834071058796,
      "learning_rate": 9.789771888432375e-06,
      "loss": 0.2338,
      "step": 88
    },
    {
      "epoch": 0.18736842105263157,
      "grad_norm": 3.4431668867918694,
      "learning_rate": 9.785001562057311e-06,
      "loss": 0.2596,
      "step": 89
    },
    {
      "epoch": 0.18947368421052632,
      "grad_norm": 3.4078628366971433,
      "learning_rate": 9.780178907671788e-06,
      "loss": 0.3006,
      "step": 90
    },
    {
      "epoch": 0.19157894736842104,
      "grad_norm": 3.3261364660865707,
      "learning_rate": 9.775303978015585e-06,
      "loss": 0.2951,
      "step": 91
    },
    {
      "epoch": 0.1936842105263158,
      "grad_norm": 3.4790214002153803,
      "learning_rate": 9.77037682640015e-06,
      "loss": 0.3475,
      "step": 92
    },
    {
      "epoch": 0.1957894736842105,
      "grad_norm": 3.6562472061364484,
      "learning_rate": 9.765397506708023e-06,
      "loss": 0.3599,
      "step": 93
    },
    {
      "epoch": 0.19789473684210526,
      "grad_norm": 3.3045836139909994,
      "learning_rate": 9.760366073392246e-06,
      "loss": 0.2597,
      "step": 94
    },
    {
      "epoch": 0.2,
      "grad_norm": 3.3815788433479175,
      "learning_rate": 9.755282581475769e-06,
      "loss": 0.3071,
      "step": 95
    },
    {
      "epoch": 0.20210526315789473,
      "grad_norm": 3.045252776887845,
      "learning_rate": 9.750147086550843e-06,
      "loss": 0.2225,
      "step": 96
    },
    {
      "epoch": 0.20421052631578948,
      "grad_norm": 3.373471179536508,
      "learning_rate": 9.744959644778422e-06,
      "loss": 0.3346,
      "step": 97
    },
    {
      "epoch": 0.2063157894736842,
      "grad_norm": 3.9726149132566326,
      "learning_rate": 9.739720312887536e-06,
      "loss": 0.3535,
      "step": 98
    },
    {
      "epoch": 0.20842105263157895,
      "grad_norm": 3.1493223747136225,
      "learning_rate": 9.734429148174676e-06,
      "loss": 0.2348,
      "step": 99
    },
    {
      "epoch": 0.21052631578947367,
      "grad_norm": 3.2666485311731583,
      "learning_rate": 9.729086208503174e-06,
      "loss": 0.3287,
      "step": 100
    },
    {
      "epoch": 0.21263157894736842,
      "grad_norm": 3.560150543272795,
      "learning_rate": 9.723691552302563e-06,
      "loss": 0.2926,
      "step": 101
    },
    {
      "epoch": 0.21473684210526317,
      "grad_norm": 3.7358995280744938,
      "learning_rate": 9.718245238567939e-06,
      "loss": 0.3386,
      "step": 102
    },
    {
      "epoch": 0.2168421052631579,
      "grad_norm": 2.915691266754973,
      "learning_rate": 9.712747326859316e-06,
      "loss": 0.2469,
      "step": 103
    },
    {
      "epoch": 0.21894736842105264,
      "grad_norm": 3.765457080419128,
      "learning_rate": 9.707197877300974e-06,
      "loss": 0.351,
      "step": 104
    },
    {
      "epoch": 0.22105263157894736,
      "grad_norm": 3.0668748196886395,
      "learning_rate": 9.701596950580807e-06,
      "loss": 0.2802,
      "step": 105
    },
    {
      "epoch": 0.2231578947368421,
      "grad_norm": 4.1412641761679465,
      "learning_rate": 9.69594460794965e-06,
      "loss": 0.2647,
      "step": 106
    },
    {
      "epoch": 0.22526315789473683,
      "grad_norm": 2.8209337016614167,
      "learning_rate": 9.690240911220618e-06,
      "loss": 0.2186,
      "step": 107
    },
    {
      "epoch": 0.22736842105263158,
      "grad_norm": 2.8786177165081424,
      "learning_rate": 9.684485922768422e-06,
      "loss": 0.231,
      "step": 108
    },
    {
      "epoch": 0.2294736842105263,
      "grad_norm": 3.324372122776949,
      "learning_rate": 9.678679705528699e-06,
      "loss": 0.2543,
      "step": 109
    },
    {
      "epoch": 0.23157894736842105,
      "grad_norm": 3.925045356831471,
      "learning_rate": 9.672822322997305e-06,
      "loss": 0.281,
      "step": 110
    },
    {
      "epoch": 0.2336842105263158,
      "grad_norm": 3.168490279807808,
      "learning_rate": 9.666913839229639e-06,
      "loss": 0.298,
      "step": 111
    },
    {
      "epoch": 0.23578947368421052,
      "grad_norm": 3.2577818549862867,
      "learning_rate": 9.660954318839934e-06,
      "loss": 0.2663,
      "step": 112
    },
    {
      "epoch": 0.23789473684210527,
      "grad_norm": 3.8656752201735984,
      "learning_rate": 9.654943827000548e-06,
      "loss": 0.2712,
      "step": 113
    },
    {
      "epoch": 0.24,
      "grad_norm": 3.5154758832724218,
      "learning_rate": 9.648882429441258e-06,
      "loss": 0.2731,
      "step": 114
    },
    {
      "epoch": 0.24210526315789474,
      "grad_norm": 2.8687141447582376,
      "learning_rate": 9.642770192448537e-06,
      "loss": 0.2452,
      "step": 115
    },
    {
      "epoch": 0.24421052631578946,
      "grad_norm": 3.6957846757319532,
      "learning_rate": 9.636607182864828e-06,
      "loss": 0.3313,
      "step": 116
    },
    {
      "epoch": 0.2463157894736842,
      "grad_norm": 2.9551806554723603,
      "learning_rate": 9.630393468087818e-06,
      "loss": 0.2452,
      "step": 117
    },
    {
      "epoch": 0.24842105263157896,
      "grad_norm": 3.2418146201338076,
      "learning_rate": 9.624129116069695e-06,
      "loss": 0.2902,
      "step": 118
    },
    {
      "epoch": 0.2505263157894737,
      "grad_norm": 3.42312293812338,
      "learning_rate": 9.61781419531641e-06,
      "loss": 0.3117,
      "step": 119
    },
    {
      "epoch": 0.25263157894736843,
      "grad_norm": 3.7071145440083355,
      "learning_rate": 9.611448774886925e-06,
      "loss": 0.3434,
      "step": 120
    },
    {
      "epoch": 0.25473684210526315,
      "grad_norm": 3.2140221247594725,
      "learning_rate": 9.605032924392457e-06,
      "loss": 0.296,
      "step": 121
    },
    {
      "epoch": 0.25684210526315787,
      "grad_norm": 3.5032254608727325,
      "learning_rate": 9.598566713995718e-06,
      "loss": 0.2626,
      "step": 122
    },
    {
      "epoch": 0.25894736842105265,
      "grad_norm": 3.15141293077419,
      "learning_rate": 9.592050214410152e-06,
      "loss": 0.2757,
      "step": 123
    },
    {
      "epoch": 0.26105263157894737,
      "grad_norm": 3.6472637355316255,
      "learning_rate": 9.585483496899151e-06,
      "loss": 0.2756,
      "step": 124
    },
    {
      "epoch": 0.2631578947368421,
      "grad_norm": 2.951085484163403,
      "learning_rate": 9.578866633275289e-06,
      "loss": 0.2277,
      "step": 125
    },
    {
      "epoch": 0.26526315789473687,
      "grad_norm": 3.6608344418079124,
      "learning_rate": 9.572199695899522e-06,
      "loss": 0.3656,
      "step": 126
    },
    {
      "epoch": 0.2673684210526316,
      "grad_norm": 2.875867453563808,
      "learning_rate": 9.565482757680415e-06,
      "loss": 0.2981,
      "step": 127
    },
    {
      "epoch": 0.2694736842105263,
      "grad_norm": 2.969989429575355,
      "learning_rate": 9.558715892073324e-06,
      "loss": 0.3036,
      "step": 128
    },
    {
      "epoch": 0.27157894736842103,
      "grad_norm": 2.8534681274152036,
      "learning_rate": 9.551899173079607e-06,
      "loss": 0.2927,
      "step": 129
    },
    {
      "epoch": 0.2736842105263158,
      "grad_norm": 3.082901025759934,
      "learning_rate": 9.545032675245814e-06,
      "loss": 0.3322,
      "step": 130
    },
    {
      "epoch": 0.27578947368421053,
      "grad_norm": 3.5290876802965996,
      "learning_rate": 9.538116473662862e-06,
      "loss": 0.3379,
      "step": 131
    },
    {
      "epoch": 0.27789473684210525,
      "grad_norm": 3.0428232899935406,
      "learning_rate": 9.531150643965224e-06,
      "loss": 0.2357,
      "step": 132
    },
    {
      "epoch": 0.28,
      "grad_norm": 3.5333703853706457,
      "learning_rate": 9.524135262330098e-06,
      "loss": 0.2428,
      "step": 133
    },
    {
      "epoch": 0.28210526315789475,
      "grad_norm": 3.0475010473798005,
      "learning_rate": 9.517070405476575e-06,
      "loss": 0.2935,
      "step": 134
    },
    {
      "epoch": 0.28421052631578947,
      "grad_norm": 3.240537024042731,
      "learning_rate": 9.509956150664796e-06,
      "loss": 0.2613,
      "step": 135
    },
    {
      "epoch": 0.2863157894736842,
      "grad_norm": 2.9128730104491973,
      "learning_rate": 9.502792575695112e-06,
      "loss": 0.2347,
      "step": 136
    },
    {
      "epoch": 0.28842105263157897,
      "grad_norm": 3.330795542026535,
      "learning_rate": 9.495579758907231e-06,
      "loss": 0.3263,
      "step": 137
    },
    {
      "epoch": 0.2905263157894737,
      "grad_norm": 3.2067221175533023,
      "learning_rate": 9.48831777917936e-06,
      "loss": 0.3092,
      "step": 138
    },
    {
      "epoch": 0.2926315789473684,
      "grad_norm": 3.0353623237386813,
      "learning_rate": 9.481006715927352e-06,
      "loss": 0.2901,
      "step": 139
    },
    {
      "epoch": 0.29473684210526313,
      "grad_norm": 3.477639320680317,
      "learning_rate": 9.473646649103819e-06,
      "loss": 0.2697,
      "step": 140
    },
    {
      "epoch": 0.2968421052631579,
      "grad_norm": 2.721056544196183,
      "learning_rate": 9.466237659197271e-06,
      "loss": 0.2241,
      "step": 141
    },
    {
      "epoch": 0.29894736842105263,
      "grad_norm": 3.163833843700446,
      "learning_rate": 9.458779827231237e-06,
      "loss": 0.2656,
      "step": 142
    },
    {
      "epoch": 0.30105263157894735,
      "grad_norm": 3.110748967764754,
      "learning_rate": 9.451273234763372e-06,
      "loss": 0.2617,
      "step": 143
    },
    {
      "epoch": 0.3031578947368421,
      "grad_norm": 3.0170106549603504,
      "learning_rate": 9.443717963884568e-06,
      "loss": 0.2447,
      "step": 144
    },
    {
      "epoch": 0.30526315789473685,
      "grad_norm": 3.3315550272526844,
      "learning_rate": 9.43611409721806e-06,
      "loss": 0.3067,
      "step": 145
    },
    {
      "epoch": 0.30736842105263157,
      "grad_norm": 3.1066342898280905,
      "learning_rate": 9.428461717918512e-06,
      "loss": 0.3262,
      "step": 146
    },
    {
      "epoch": 0.3094736842105263,
      "grad_norm": 3.3496341428081315,
      "learning_rate": 9.420760909671119e-06,
      "loss": 0.2454,
      "step": 147
    },
    {
      "epoch": 0.31157894736842107,
      "grad_norm": 3.421650153297572,
      "learning_rate": 9.413011756690686e-06,
      "loss": 0.3029,
      "step": 148
    },
    {
      "epoch": 0.3136842105263158,
      "grad_norm": 3.044311191542287,
      "learning_rate": 9.405214343720708e-06,
      "loss": 0.2959,
      "step": 149
    },
    {
      "epoch": 0.3157894736842105,
      "grad_norm": 2.7783609067575545,
      "learning_rate": 9.397368756032445e-06,
      "loss": 0.2575,
      "step": 150
    },
    {
      "epoch": 0.3178947368421053,
      "grad_norm": 3.1794076058884895,
      "learning_rate": 9.389475079423988e-06,
      "loss": 0.2782,
      "step": 151
    },
    {
      "epoch": 0.32,
      "grad_norm": 4.565412279405237,
      "learning_rate": 9.381533400219319e-06,
      "loss": 0.3381,
      "step": 152
    },
    {
      "epoch": 0.32210526315789473,
      "grad_norm": 3.7246780777461637,
      "learning_rate": 9.373543805267367e-06,
      "loss": 0.2958,
      "step": 153
    },
    {
      "epoch": 0.32421052631578945,
      "grad_norm": 3.056241661591908,
      "learning_rate": 9.365506381941066e-06,
      "loss": 0.281,
      "step": 154
    },
    {
      "epoch": 0.3263157894736842,
      "grad_norm": 3.366326120576616,
      "learning_rate": 9.357421218136387e-06,
      "loss": 0.202,
      "step": 155
    },
    {
      "epoch": 0.32842105263157895,
      "grad_norm": 3.457666911533099,
      "learning_rate": 9.349288402271387e-06,
      "loss": 0.3005,
      "step": 156
    },
    {
      "epoch": 0.33052631578947367,
      "grad_norm": 2.3640574343694096,
      "learning_rate": 9.341108023285239e-06,
      "loss": 0.1791,
      "step": 157
    },
    {
      "epoch": 0.33263157894736844,
      "grad_norm": 3.5072575679163203,
      "learning_rate": 9.332880170637252e-06,
      "loss": 0.2887,
      "step": 158
    },
    {
      "epoch": 0.33473684210526317,
      "grad_norm": 4.034163555212422,
      "learning_rate": 9.324604934305911e-06,
      "loss": 0.2583,
      "step": 159
    },
    {
      "epoch": 0.3368421052631579,
      "grad_norm": 3.4137170461677178,
      "learning_rate": 9.31628240478787e-06,
      "loss": 0.3196,
      "step": 160
    },
    {
      "epoch": 0.3389473684210526,
      "grad_norm": 3.5366155768451297,
      "learning_rate": 9.30791267309698e-06,
      "loss": 0.2977,
      "step": 161
    },
    {
      "epoch": 0.3410526315789474,
      "grad_norm": 3.2286495303508684,
      "learning_rate": 9.299495830763285e-06,
      "loss": 0.3006,
      "step": 162
    },
    {
      "epoch": 0.3431578947368421,
      "grad_norm": 3.515490852518791,
      "learning_rate": 9.291031969832026e-06,
      "loss": 0.276,
      "step": 163
    },
    {
      "epoch": 0.3452631578947368,
      "grad_norm": 3.8556810694559855,
      "learning_rate": 9.28252118286263e-06,
      "loss": 0.2854,
      "step": 164
    },
    {
      "epoch": 0.3473684210526316,
      "grad_norm": 3.275312388330099,
      "learning_rate": 9.273963562927695e-06,
      "loss": 0.2563,
      "step": 165
    },
    {
      "epoch": 0.3494736842105263,
      "grad_norm": 3.491765481220755,
      "learning_rate": 9.265359203611988e-06,
      "loss": 0.3133,
      "step": 166
    },
    {
      "epoch": 0.35157894736842105,
      "grad_norm": 3.473334617513911,
      "learning_rate": 9.256708199011402e-06,
      "loss": 0.2385,
      "step": 167
    },
    {
      "epoch": 0.35368421052631577,
      "grad_norm": 3.397043567474343,
      "learning_rate": 9.248010643731936e-06,
      "loss": 0.247,
      "step": 168
    },
    {
      "epoch": 0.35578947368421054,
      "grad_norm": 3.649860343855273,
      "learning_rate": 9.23926663288866e-06,
      "loss": 0.2802,
      "step": 169
    },
    {
      "epoch": 0.35789473684210527,
      "grad_norm": 3.813391941041034,
      "learning_rate": 9.230476262104678e-06,
      "loss": 0.3087,
      "step": 170
    },
    {
      "epoch": 0.36,
      "grad_norm": 3.6905074018171344,
      "learning_rate": 9.221639627510076e-06,
      "loss": 0.3462,
      "step": 171
    },
    {
      "epoch": 0.36210526315789476,
      "grad_norm": 3.6369332762547124,
      "learning_rate": 9.212756825740874e-06,
      "loss": 0.2893,
      "step": 172
    },
    {
      "epoch": 0.3642105263157895,
      "grad_norm": 3.124067402795342,
      "learning_rate": 9.203827953937969e-06,
      "loss": 0.2621,
      "step": 173
    },
    {
      "epoch": 0.3663157894736842,
      "grad_norm": 4.009622033887673,
      "learning_rate": 9.194853109746073e-06,
      "loss": 0.2958,
      "step": 174
    },
    {
      "epoch": 0.3684210526315789,
      "grad_norm": 3.277654059786843,
      "learning_rate": 9.185832391312644e-06,
      "loss": 0.2765,
      "step": 175
    },
    {
      "epoch": 0.3705263157894737,
      "grad_norm": 2.77349749923877,
      "learning_rate": 9.176765897286812e-06,
      "loss": 0.2127,
      "step": 176
    },
    {
      "epoch": 0.3726315789473684,
      "grad_norm": 3.527606374595019,
      "learning_rate": 9.167653726818305e-06,
      "loss": 0.3067,
      "step": 177
    },
    {
      "epoch": 0.37473684210526315,
      "grad_norm": 2.6924440680853947,
      "learning_rate": 9.15849597955636e-06,
      "loss": 0.2259,
      "step": 178
    },
    {
      "epoch": 0.37684210526315787,
      "grad_norm": 3.886574108001305,
      "learning_rate": 9.149292755648631e-06,
      "loss": 0.2923,
      "step": 179
    },
    {
      "epoch": 0.37894736842105264,
      "grad_norm": 2.818379903417439,
      "learning_rate": 9.140044155740102e-06,
      "loss": 0.2295,
      "step": 180
    },
    {
      "epoch": 0.38105263157894737,
      "grad_norm": 3.2260018781876623,
      "learning_rate": 9.130750280971978e-06,
      "loss": 0.3383,
      "step": 181
    },
    {
      "epoch": 0.3831578947368421,
      "grad_norm": 2.739913862532641,
      "learning_rate": 9.121411232980589e-06,
      "loss": 0.2404,
      "step": 182
    },
    {
      "epoch": 0.38526315789473686,
      "grad_norm": 3.199557557458389,
      "learning_rate": 9.112027113896262e-06,
      "loss": 0.288,
      "step": 183
    },
    {
      "epoch": 0.3873684210526316,
      "grad_norm": 3.757331027330083,
      "learning_rate": 9.102598026342223e-06,
      "loss": 0.2045,
      "step": 184
    },
    {
      "epoch": 0.3894736842105263,
      "grad_norm": 3.799798504081319,
      "learning_rate": 9.093124073433464e-06,
      "loss": 0.3113,
      "step": 185
    },
    {
      "epoch": 0.391578947368421,
      "grad_norm": 2.9479882498425405,
      "learning_rate": 9.083605358775612e-06,
      "loss": 0.2591,
      "step": 186
    },
    {
      "epoch": 0.3936842105263158,
      "grad_norm": 2.8984352246005636,
      "learning_rate": 9.074041986463808e-06,
      "loss": 0.2094,
      "step": 187
    },
    {
      "epoch": 0.3957894736842105,
      "grad_norm": 3.1258831680405943,
      "learning_rate": 9.064434061081562e-06,
      "loss": 0.2644,
      "step": 188
    },
    {
      "epoch": 0.39789473684210525,
      "grad_norm": 3.8610806420066046,
      "learning_rate": 9.0547816876996e-06,
      "loss": 0.3445,
      "step": 189
    },
    {
      "epoch": 0.4,
      "grad_norm": 3.0731247252689755,
      "learning_rate": 9.045084971874738e-06,
      "loss": 0.2819,
      "step": 190
    },
    {
      "epoch": 0.40210526315789474,
      "grad_norm": 2.7506464613941524,
      "learning_rate": 9.035344019648701e-06,
      "loss": 0.2222,
      "step": 191
    },
    {
      "epoch": 0.40421052631578946,
      "grad_norm": 3.551940976528996,
      "learning_rate": 9.025558937546987e-06,
      "loss": 0.4067,
      "step": 192
    },
    {
      "epoch": 0.4063157894736842,
      "grad_norm": 3.0238265322464923,
      "learning_rate": 9.015729832577681e-06,
      "loss": 0.2453,
      "step": 193
    },
    {
      "epoch": 0.40842105263157896,
      "grad_norm": 3.177883029033834,
      "learning_rate": 9.005856812230304e-06,
      "loss": 0.2448,
      "step": 194
    },
    {
      "epoch": 0.4105263157894737,
      "grad_norm": 2.9637061343290427,
      "learning_rate": 8.995939984474624e-06,
      "loss": 0.2293,
      "step": 195
    },
    {
      "epoch": 0.4126315789473684,
      "grad_norm": 2.946477365180333,
      "learning_rate": 8.98597945775948e-06,
      "loss": 0.2538,
      "step": 196
    },
    {
      "epoch": 0.4147368421052632,
      "grad_norm": 3.165367892256443,
      "learning_rate": 8.975975341011595e-06,
      "loss": 0.3132,
      "step": 197
    },
    {
      "epoch": 0.4168421052631579,
      "grad_norm": 2.786548810153527,
      "learning_rate": 8.96592774363439e-06,
      "loss": 0.2135,
      "step": 198
    },
    {
      "epoch": 0.4189473684210526,
      "grad_norm": 3.285599254850826,
      "learning_rate": 8.955836775506776e-06,
      "loss": 0.2379,
      "step": 199
    },
    {
      "epoch": 0.42105263157894735,
      "grad_norm": 4.108210407116029,
      "learning_rate": 8.94570254698197e-06,
      "loss": 0.2778,
      "step": 200
    },
    {
      "epoch": 0.42105263157894735,
      "eval_loss": 0.26044589281082153,
      "eval_runtime": 0.9326,
      "eval_samples_per_second": 41.818,
      "eval_steps_per_second": 10.722,
      "step": 200
    },
    {
      "epoch": 0.4231578947368421,
      "grad_norm": 3.5939015363403843,
      "learning_rate": 8.935525168886263e-06,
      "loss": 0.2727,
      "step": 201
    },
    {
      "epoch": 0.42526315789473684,
      "grad_norm": 3.266123999642131,
      "learning_rate": 8.92530475251784e-06,
      "loss": 0.2421,
      "step": 202
    },
    {
      "epoch": 0.42736842105263156,
      "grad_norm": 4.2885736406975825,
      "learning_rate": 8.91504140964553e-06,
      "loss": 0.3279,
      "step": 203
    },
    {
      "epoch": 0.42947368421052634,
      "grad_norm": 3.6581971350961715,
      "learning_rate": 8.90473525250761e-06,
      "loss": 0.2979,
      "step": 204
    },
    {
      "epoch": 0.43157894736842106,
      "grad_norm": 3.9824032374811025,
      "learning_rate": 8.894386393810563e-06,
      "loss": 0.2764,
      "step": 205
    },
    {
      "epoch": 0.4336842105263158,
      "grad_norm": 3.1254122691470183,
      "learning_rate": 8.883994946727848e-06,
      "loss": 0.2089,
      "step": 206
    },
    {
      "epoch": 0.4357894736842105,
      "grad_norm": 4.247423358470422,
      "learning_rate": 8.873561024898668e-06,
      "loss": 0.2691,
      "step": 207
    },
    {
      "epoch": 0.4378947368421053,
      "grad_norm": 3.0648535317590655,
      "learning_rate": 8.863084742426719e-06,
      "loss": 0.2043,
      "step": 208
    },
    {
      "epoch": 0.44,
      "grad_norm": 3.068552327060875,
      "learning_rate": 8.852566213878947e-06,
      "loss": 0.2468,
      "step": 209
    },
    {
      "epoch": 0.4421052631578947,
      "grad_norm": 3.764319293504002,
      "learning_rate": 8.842005554284296e-06,
      "loss": 0.3041,
      "step": 210
    },
    {
      "epoch": 0.4442105263157895,
      "grad_norm": 3.28381437259028,
      "learning_rate": 8.831402879132447e-06,
      "loss": 0.2951,
      "step": 211
    },
    {
      "epoch": 0.4463157894736842,
      "grad_norm": 3.0292296611478173,
      "learning_rate": 8.820758304372557e-06,
      "loss": 0.2497,
      "step": 212
    },
    {
      "epoch": 0.44842105263157894,
      "grad_norm": 3.3884010965584945,
      "learning_rate": 8.810071946411989e-06,
      "loss": 0.2622,
      "step": 213
    },
    {
      "epoch": 0.45052631578947366,
      "grad_norm": 3.393874059981341,
      "learning_rate": 8.799343922115045e-06,
      "loss": 0.2708,
      "step": 214
    },
    {
      "epoch": 0.45263157894736844,
      "grad_norm": 2.909948972542914,
      "learning_rate": 8.788574348801676e-06,
      "loss": 0.2542,
      "step": 215
    },
    {
      "epoch": 0.45473684210526316,
      "grad_norm": 3.3522048696292694,
      "learning_rate": 8.777763344246209e-06,
      "loss": 0.2503,
      "step": 216
    },
    {
      "epoch": 0.4568421052631579,
      "grad_norm": 2.821449551025134,
      "learning_rate": 8.766911026676063e-06,
      "loss": 0.2493,
      "step": 217
    },
    {
      "epoch": 0.4589473684210526,
      "grad_norm": 3.5684952127975613,
      "learning_rate": 8.756017514770444e-06,
      "loss": 0.2407,
      "step": 218
    },
    {
      "epoch": 0.4610526315789474,
      "grad_norm": 3.7589924204118867,
      "learning_rate": 8.745082927659048e-06,
      "loss": 0.3244,
      "step": 219
    },
    {
      "epoch": 0.4631578947368421,
      "grad_norm": 3.408146894460298,
      "learning_rate": 8.734107384920771e-06,
      "loss": 0.3119,
      "step": 220
    },
    {
      "epoch": 0.4652631578947368,
      "grad_norm": 2.555651765279721,
      "learning_rate": 8.72309100658239e-06,
      "loss": 0.2244,
      "step": 221
    },
    {
      "epoch": 0.4673684210526316,
      "grad_norm": 2.868395817759443,
      "learning_rate": 8.71203391311725e-06,
      "loss": 0.227,
      "step": 222
    },
    {
      "epoch": 0.4694736842105263,
      "grad_norm": 3.2734154431001676,
      "learning_rate": 8.700936225443958e-06,
      "loss": 0.3075,
      "step": 223
    },
    {
      "epoch": 0.47157894736842104,
      "grad_norm": 3.1578793105562317,
      "learning_rate": 8.689798064925049e-06,
      "loss": 0.3325,
      "step": 224
    },
    {
      "epoch": 0.47368421052631576,
      "grad_norm": 2.6029524703601674,
      "learning_rate": 8.67861955336566e-06,
      "loss": 0.2634,
      "step": 225
    },
    {
      "epoch": 0.47578947368421054,
      "grad_norm": 2.488560140997595,
      "learning_rate": 8.6674008130122e-06,
      "loss": 0.2149,
      "step": 226
    },
    {
      "epoch": 0.47789473684210526,
      "grad_norm": 3.71862701964302,
      "learning_rate": 8.65614196655102e-06,
      "loss": 0.2992,
      "step": 227
    },
    {
      "epoch": 0.48,
      "grad_norm": 3.2971708770584387,
      "learning_rate": 8.644843137107058e-06,
      "loss": 0.2897,
      "step": 228
    },
    {
      "epoch": 0.48210526315789476,
      "grad_norm": 2.7980397594087467,
      "learning_rate": 8.633504448242504e-06,
      "loss": 0.2262,
      "step": 229
    },
    {
      "epoch": 0.4842105263157895,
      "grad_norm": 2.7495976567100606,
      "learning_rate": 8.622126023955446e-06,
      "loss": 0.1996,
      "step": 230
    },
    {
      "epoch": 0.4863157894736842,
      "grad_norm": 3.2796491708704,
      "learning_rate": 8.610707988678504e-06,
      "loss": 0.2389,
      "step": 231
    },
    {
      "epoch": 0.4884210526315789,
      "grad_norm": 3.4418510848878943,
      "learning_rate": 8.599250467277483e-06,
      "loss": 0.3318,
      "step": 232
    },
    {
      "epoch": 0.4905263157894737,
      "grad_norm": 3.3793449517646366,
      "learning_rate": 8.587753585050004e-06,
      "loss": 0.2316,
      "step": 233
    },
    {
      "epoch": 0.4926315789473684,
      "grad_norm": 2.9552327613137646,
      "learning_rate": 8.576217467724129e-06,
      "loss": 0.2581,
      "step": 234
    },
    {
      "epoch": 0.49473684210526314,
      "grad_norm": 2.976640505829934,
      "learning_rate": 8.564642241456986e-06,
      "loss": 0.2318,
      "step": 235
    },
    {
      "epoch": 0.4968421052631579,
      "grad_norm": 3.34417602657529,
      "learning_rate": 8.553028032833397e-06,
      "loss": 0.1977,
      "step": 236
    },
    {
      "epoch": 0.49894736842105264,
      "grad_norm": 3.2756869130672746,
      "learning_rate": 8.541374968864486e-06,
      "loss": 0.2541,
      "step": 237
    },
    {
      "epoch": 0.5010526315789474,
      "grad_norm": 3.139891699786457,
      "learning_rate": 8.529683176986295e-06,
      "loss": 0.2076,
      "step": 238
    },
    {
      "epoch": 0.5031578947368421,
      "grad_norm": 3.4708450417927312,
      "learning_rate": 8.517952785058385e-06,
      "loss": 0.3085,
      "step": 239
    },
    {
      "epoch": 0.5052631578947369,
      "grad_norm": 3.172121164209103,
      "learning_rate": 8.506183921362443e-06,
      "loss": 0.2338,
      "step": 240
    },
    {
      "epoch": 0.5073684210526316,
      "grad_norm": 3.3114937881451367,
      "learning_rate": 8.494376714600878e-06,
      "loss": 0.2751,
      "step": 241
    },
    {
      "epoch": 0.5094736842105263,
      "grad_norm": 3.300701046185496,
      "learning_rate": 8.482531293895412e-06,
      "loss": 0.257,
      "step": 242
    },
    {
      "epoch": 0.511578947368421,
      "grad_norm": 3.287351087582164,
      "learning_rate": 8.470647788785665e-06,
      "loss": 0.2511,
      "step": 243
    },
    {
      "epoch": 0.5136842105263157,
      "grad_norm": 3.2948211696172995,
      "learning_rate": 8.458726329227748e-06,
      "loss": 0.2761,
      "step": 244
    },
    {
      "epoch": 0.5157894736842106,
      "grad_norm": 3.042525996501928,
      "learning_rate": 8.446767045592829e-06,
      "loss": 0.2234,
      "step": 245
    },
    {
      "epoch": 0.5178947368421053,
      "grad_norm": 3.437886226784909,
      "learning_rate": 8.434770068665723e-06,
      "loss": 0.2827,
      "step": 246
    },
    {
      "epoch": 0.52,
      "grad_norm": 3.007789073559327,
      "learning_rate": 8.422735529643445e-06,
      "loss": 0.2458,
      "step": 247
    },
    {
      "epoch": 0.5221052631578947,
      "grad_norm": 3.3888617933137035,
      "learning_rate": 8.410663560133784e-06,
      "loss": 0.2861,
      "step": 248
    },
    {
      "epoch": 0.5242105263157895,
      "grad_norm": 3.5687647723690015,
      "learning_rate": 8.398554292153866e-06,
      "loss": 0.2458,
      "step": 249
    },
    {
      "epoch": 0.5263157894736842,
      "grad_norm": 3.0507951054357476,
      "learning_rate": 8.386407858128707e-06,
      "loss": 0.3154,
      "step": 250
    },
    {
      "epoch": 0.5284210526315789,
      "grad_norm": 2.863867309728824,
      "learning_rate": 8.37422439088976e-06,
      "loss": 0.2347,
      "step": 251
    },
    {
      "epoch": 0.5305263157894737,
      "grad_norm": 3.3374440655963156,
      "learning_rate": 8.362004023673473e-06,
      "loss": 0.2541,
      "step": 252
    },
    {
      "epoch": 0.5326315789473685,
      "grad_norm": 3.3399986517698754,
      "learning_rate": 8.349746890119826e-06,
      "loss": 0.268,
      "step": 253
    },
    {
      "epoch": 0.5347368421052632,
      "grad_norm": 2.8070218152646103,
      "learning_rate": 8.337453124270864e-06,
      "loss": 0.2391,
      "step": 254
    },
    {
      "epoch": 0.5368421052631579,
      "grad_norm": 3.0741729288875472,
      "learning_rate": 8.325122860569241e-06,
      "loss": 0.1825,
      "step": 255
    },
    {
      "epoch": 0.5389473684210526,
      "grad_norm": 3.3102116063181914,
      "learning_rate": 8.31275623385675e-06,
      "loss": 0.2482,
      "step": 256
    },
    {
      "epoch": 0.5410526315789473,
      "grad_norm": 3.403284847164163,
      "learning_rate": 8.300353379372834e-06,
      "loss": 0.2601,
      "step": 257
    },
    {
      "epoch": 0.5431578947368421,
      "grad_norm": 3.3396369078731882,
      "learning_rate": 8.287914432753123e-06,
      "loss": 0.2411,
      "step": 258
    },
    {
      "epoch": 0.5452631578947369,
      "grad_norm": 4.012947971042055,
      "learning_rate": 8.275439530027948e-06,
      "loss": 0.3046,
      "step": 259
    },
    {
      "epoch": 0.5473684210526316,
      "grad_norm": 3.3973537835566465,
      "learning_rate": 8.262928807620843e-06,
      "loss": 0.2689,
      "step": 260
    },
    {
      "epoch": 0.5494736842105263,
      "grad_norm": 3.421404804284609,
      "learning_rate": 8.250382402347066e-06,
      "loss": 0.2591,
      "step": 261
    },
    {
      "epoch": 0.5515789473684211,
      "grad_norm": 2.8126406756186775,
      "learning_rate": 8.237800451412095e-06,
      "loss": 0.2222,
      "step": 262
    },
    {
      "epoch": 0.5536842105263158,
      "grad_norm": 3.3510651542940693,
      "learning_rate": 8.225183092410128e-06,
      "loss": 0.2697,
      "step": 263
    },
    {
      "epoch": 0.5557894736842105,
      "grad_norm": 2.986604644452873,
      "learning_rate": 8.212530463322584e-06,
      "loss": 0.2808,
      "step": 264
    },
    {
      "epoch": 0.5578947368421052,
      "grad_norm": 3.5548559595995957,
      "learning_rate": 8.199842702516584e-06,
      "loss": 0.291,
      "step": 265
    },
    {
      "epoch": 0.56,
      "grad_norm": 3.6456873792089257,
      "learning_rate": 8.18711994874345e-06,
      "loss": 0.2706,
      "step": 266
    },
    {
      "epoch": 0.5621052631578948,
      "grad_norm": 3.733960403232091,
      "learning_rate": 8.174362341137177e-06,
      "loss": 0.266,
      "step": 267
    },
    {
      "epoch": 0.5642105263157895,
      "grad_norm": 3.4930660301469643,
      "learning_rate": 8.161570019212921e-06,
      "loss": 0.2699,
      "step": 268
    },
    {
      "epoch": 0.5663157894736842,
      "grad_norm": 3.0306590004360796,
      "learning_rate": 8.148743122865463e-06,
      "loss": 0.2661,
      "step": 269
    },
    {
      "epoch": 0.5684210526315789,
      "grad_norm": 3.773204424271571,
      "learning_rate": 8.135881792367686e-06,
      "loss": 0.3432,
      "step": 270
    },
    {
      "epoch": 0.5705263157894737,
      "grad_norm": 3.1394554778302526,
      "learning_rate": 8.12298616836904e-06,
      "loss": 0.2436,
      "step": 271
    },
    {
      "epoch": 0.5726315789473684,
      "grad_norm": 2.8431644921557213,
      "learning_rate": 8.110056391894005e-06,
      "loss": 0.2228,
      "step": 272
    },
    {
      "epoch": 0.5747368421052632,
      "grad_norm": 3.2898202937032823,
      "learning_rate": 8.097092604340543e-06,
      "loss": 0.2782,
      "step": 273
    },
    {
      "epoch": 0.5768421052631579,
      "grad_norm": 3.7772688134474293,
      "learning_rate": 8.084094947478556e-06,
      "loss": 0.2909,
      "step": 274
    },
    {
      "epoch": 0.5789473684210527,
      "grad_norm": 3.094367907000018,
      "learning_rate": 8.071063563448341e-06,
      "loss": 0.2325,
      "step": 275
    },
    {
      "epoch": 0.5810526315789474,
      "grad_norm": 2.4776499929932534,
      "learning_rate": 8.057998594759022e-06,
      "loss": 0.1811,
      "step": 276
    },
    {
      "epoch": 0.5831578947368421,
      "grad_norm": 3.2492815402735284,
      "learning_rate": 8.044900184287007e-06,
      "loss": 0.2943,
      "step": 277
    },
    {
      "epoch": 0.5852631578947368,
      "grad_norm": 3.1427693790707383,
      "learning_rate": 8.031768475274412e-06,
      "loss": 0.2753,
      "step": 278
    },
    {
      "epoch": 0.5873684210526315,
      "grad_norm": 3.177805841670074,
      "learning_rate": 8.018603611327505e-06,
      "loss": 0.2545,
      "step": 279
    },
    {
      "epoch": 0.5894736842105263,
      "grad_norm": 3.776526328121304,
      "learning_rate": 8.005405736415127e-06,
      "loss": 0.3011,
      "step": 280
    },
    {
      "epoch": 0.5915789473684211,
      "grad_norm": 3.3699727333670557,
      "learning_rate": 7.992174994867124e-06,
      "loss": 0.2121,
      "step": 281
    },
    {
      "epoch": 0.5936842105263158,
      "grad_norm": 2.8041575327507795,
      "learning_rate": 7.978911531372764e-06,
      "loss": 0.2449,
      "step": 282
    },
    {
      "epoch": 0.5957894736842105,
      "grad_norm": 3.331718845848788,
      "learning_rate": 7.965615490979165e-06,
      "loss": 0.277,
      "step": 283
    },
    {
      "epoch": 0.5978947368421053,
      "grad_norm": 3.729754183258392,
      "learning_rate": 7.952287019089686e-06,
      "loss": 0.2904,
      "step": 284
    },
    {
      "epoch": 0.6,
      "grad_norm": 3.5336258021533884,
      "learning_rate": 7.938926261462366e-06,
      "loss": 0.2918,
      "step": 285
    },
    {
      "epoch": 0.6021052631578947,
      "grad_norm": 3.1709904925861125,
      "learning_rate": 7.925533364208308e-06,
      "loss": 0.2525,
      "step": 286
    },
    {
      "epoch": 0.6042105263157894,
      "grad_norm": 3.866549016558195,
      "learning_rate": 7.912108473790092e-06,
      "loss": 0.2392,
      "step": 287
    },
    {
      "epoch": 0.6063157894736843,
      "grad_norm": 3.516428202377018,
      "learning_rate": 7.898651737020166e-06,
      "loss": 0.3108,
      "step": 288
    },
    {
      "epoch": 0.608421052631579,
      "grad_norm": 3.828883844428092,
      "learning_rate": 7.885163301059251e-06,
      "loss": 0.2484,
      "step": 289
    },
    {
      "epoch": 0.6105263157894737,
      "grad_norm": 4.038586072604508,
      "learning_rate": 7.871643313414718e-06,
      "loss": 0.3028,
      "step": 290
    },
    {
      "epoch": 0.6126315789473684,
      "grad_norm": 3.217055390670865,
      "learning_rate": 7.858091921938989e-06,
      "loss": 0.295,
      "step": 291
    },
    {
      "epoch": 0.6147368421052631,
      "grad_norm": 3.667294038950859,
      "learning_rate": 7.844509274827907e-06,
      "loss": 0.278,
      "step": 292
    },
    {
      "epoch": 0.6168421052631579,
      "grad_norm": 2.8686712875415314,
      "learning_rate": 7.830895520619129e-06,
      "loss": 0.2609,
      "step": 293
    },
    {
      "epoch": 0.6189473684210526,
      "grad_norm": 2.9401482502186167,
      "learning_rate": 7.817250808190483e-06,
      "loss": 0.2616,
      "step": 294
    },
    {
      "epoch": 0.6210526315789474,
      "grad_norm": 2.492677050804819,
      "learning_rate": 7.803575286758365e-06,
      "loss": 0.205,
      "step": 295
    },
    {
      "epoch": 0.6231578947368421,
      "grad_norm": 3.2739091101128235,
      "learning_rate": 7.789869105876083e-06,
      "loss": 0.2597,
      "step": 296
    },
    {
      "epoch": 0.6252631578947369,
      "grad_norm": 2.8382019555630333,
      "learning_rate": 7.776132415432234e-06,
      "loss": 0.2632,
      "step": 297
    },
    {
      "epoch": 0.6273684210526316,
      "grad_norm": 3.2912951032196847,
      "learning_rate": 7.762365365649068e-06,
      "loss": 0.2792,
      "step": 298
    },
    {
      "epoch": 0.6294736842105263,
      "grad_norm": 3.1441667052432853,
      "learning_rate": 7.748568107080831e-06,
      "loss": 0.3344,
      "step": 299
    },
    {
      "epoch": 0.631578947368421,
      "grad_norm": 2.687162590514958,
      "learning_rate": 7.734740790612137e-06,
      "loss": 0.2099,
      "step": 300
    },
    {
      "epoch": 0.6336842105263157,
      "grad_norm": 3.6579379448846834,
      "learning_rate": 7.720883567456299e-06,
      "loss": 0.3209,
      "step": 301
    },
    {
      "epoch": 0.6357894736842106,
      "grad_norm": 3.4874070496305523,
      "learning_rate": 7.70699658915369e-06,
      "loss": 0.3369,
      "step": 302
    },
    {
      "epoch": 0.6378947368421053,
      "grad_norm": 3.2296813376833504,
      "learning_rate": 7.693080007570084e-06,
      "loss": 0.3226,
      "step": 303
    },
    {
      "epoch": 0.64,
      "grad_norm": 2.951561052105942,
      "learning_rate": 7.679133974894984e-06,
      "loss": 0.2387,
      "step": 304
    },
    {
      "epoch": 0.6421052631578947,
      "grad_norm": 3.7736306036911005,
      "learning_rate": 7.66515864363997e-06,
      "loss": 0.2642,
      "step": 305
    },
    {
      "epoch": 0.6442105263157895,
      "grad_norm": 3.9684522413257417,
      "learning_rate": 7.651154166637025e-06,
      "loss": 0.3537,
      "step": 306
    },
    {
      "epoch": 0.6463157894736842,
      "grad_norm": 2.847959880083427,
      "learning_rate": 7.637120697036866e-06,
      "loss": 0.2129,
      "step": 307
    },
    {
      "epoch": 0.6484210526315789,
      "grad_norm": 3.3228471321798874,
      "learning_rate": 7.62305838830727e-06,
      "loss": 0.2872,
      "step": 308
    },
    {
      "epoch": 0.6505263157894737,
      "grad_norm": 3.226721545415034,
      "learning_rate": 7.608967394231387e-06,
      "loss": 0.3071,
      "step": 309
    },
    {
      "epoch": 0.6526315789473685,
      "grad_norm": 3.0776063116907038,
      "learning_rate": 7.594847868906076e-06,
      "loss": 0.2046,
      "step": 310
    },
    {
      "epoch": 0.6547368421052632,
      "grad_norm": 3.035402994986961,
      "learning_rate": 7.580699966740201e-06,
      "loss": 0.2609,
      "step": 311
    },
    {
      "epoch": 0.6568421052631579,
      "grad_norm": 4.098144779390545,
      "learning_rate": 7.566523842452958e-06,
      "loss": 0.3306,
      "step": 312
    },
    {
      "epoch": 0.6589473684210526,
      "grad_norm": 2.8711609459832084,
      "learning_rate": 7.552319651072164e-06,
      "loss": 0.2736,
      "step": 313
    },
    {
      "epoch": 0.6610526315789473,
      "grad_norm": 3.534378429059425,
      "learning_rate": 7.5380875479325855e-06,
      "loss": 0.234,
      "step": 314
    },
    {
      "epoch": 0.6631578947368421,
      "grad_norm": 3.1761510007883835,
      "learning_rate": 7.52382768867422e-06,
      "loss": 0.2159,
      "step": 315
    },
    {
      "epoch": 0.6652631578947369,
      "grad_norm": 4.187380079118218,
      "learning_rate": 7.509540229240601e-06,
      "loss": 0.3721,
      "step": 316
    },
    {
      "epoch": 0.6673684210526316,
      "grad_norm": 3.1492384845261157,
      "learning_rate": 7.4952253258771036e-06,
      "loss": 0.2671,
      "step": 317
    },
    {
      "epoch": 0.6694736842105263,
      "grad_norm": 3.3725068433395866,
      "learning_rate": 7.480883135129211e-06,
      "loss": 0.2781,
      "step": 318
    },
    {
      "epoch": 0.671578947368421,
      "grad_norm": 2.880810684845612,
      "learning_rate": 7.4665138138408255e-06,
      "loss": 0.2399,
      "step": 319
    },
    {
      "epoch": 0.6736842105263158,
      "grad_norm": 3.675738307936544,
      "learning_rate": 7.452117519152542e-06,
      "loss": 0.2816,
      "step": 320
    },
    {
      "epoch": 0.6757894736842105,
      "grad_norm": 3.556869112337663,
      "learning_rate": 7.437694408499932e-06,
      "loss": 0.2141,
      "step": 321
    },
    {
      "epoch": 0.6778947368421052,
      "grad_norm": 3.0077086336701737,
      "learning_rate": 7.4232446396118265e-06,
      "loss": 0.258,
      "step": 322
    },
    {
      "epoch": 0.68,
      "grad_norm": 3.8856684116292284,
      "learning_rate": 7.408768370508577e-06,
      "loss": 0.2716,
      "step": 323
    },
    {
      "epoch": 0.6821052631578948,
      "grad_norm": 2.704555689211984,
      "learning_rate": 7.394265759500348e-06,
      "loss": 0.1959,
      "step": 324
    },
    {
      "epoch": 0.6842105263157895,
      "grad_norm": 3.6831975327095794,
      "learning_rate": 7.379736965185369e-06,
      "loss": 0.2326,
      "step": 325
    },
    {
      "epoch": 0.6863157894736842,
      "grad_norm": 2.892505230264137,
      "learning_rate": 7.365182146448205e-06,
      "loss": 0.2145,
      "step": 326
    },
    {
      "epoch": 0.6884210526315789,
      "grad_norm": 2.913603216291662,
      "learning_rate": 7.350601462458025e-06,
      "loss": 0.2467,
      "step": 327
    },
    {
      "epoch": 0.6905263157894737,
      "grad_norm": 3.1224065548434314,
      "learning_rate": 7.335995072666848e-06,
      "loss": 0.2332,
      "step": 328
    },
    {
      "epoch": 0.6926315789473684,
      "grad_norm": 3.208885963759053,
      "learning_rate": 7.3213631368078196e-06,
      "loss": 0.2521,
      "step": 329
    },
    {
      "epoch": 0.6947368421052632,
      "grad_norm": 2.738726314664612,
      "learning_rate": 7.30670581489344e-06,
      "loss": 0.2332,
      "step": 330
    },
    {
      "epoch": 0.6968421052631579,
      "grad_norm": 3.697170501068047,
      "learning_rate": 7.292023267213836e-06,
      "loss": 0.2796,
      "step": 331
    },
    {
      "epoch": 0.6989473684210527,
      "grad_norm": 3.4594202559170832,
      "learning_rate": 7.2773156543349965e-06,
      "loss": 0.2845,
      "step": 332
    },
    {
      "epoch": 0.7010526315789474,
      "grad_norm": 2.819311848002194,
      "learning_rate": 7.262583137097019e-06,
      "loss": 0.2236,
      "step": 333
    },
    {
      "epoch": 0.7031578947368421,
      "grad_norm": 3.7286636270733102,
      "learning_rate": 7.247825876612353e-06,
      "loss": 0.3536,
      "step": 334
    },
    {
      "epoch": 0.7052631578947368,
      "grad_norm": 2.8883909376141936,
      "learning_rate": 7.233044034264034e-06,
      "loss": 0.2394,
      "step": 335
    },
    {
      "epoch": 0.7073684210526315,
      "grad_norm": 2.8405834985207123,
      "learning_rate": 7.218237771703921e-06,
      "loss": 0.2204,
      "step": 336
    },
    {
      "epoch": 0.7094736842105264,
      "grad_norm": 4.087265898865519,
      "learning_rate": 7.203407250850929e-06,
      "loss": 0.2904,
      "step": 337
    },
    {
      "epoch": 0.7115789473684211,
      "grad_norm": 2.47484433615118,
      "learning_rate": 7.18855263388926e-06,
      "loss": 0.1984,
      "step": 338
    },
    {
      "epoch": 0.7136842105263158,
      "grad_norm": 3.301930582984559,
      "learning_rate": 7.173674083266624e-06,
      "loss": 0.1802,
      "step": 339
    },
    {
      "epoch": 0.7157894736842105,
      "grad_norm": 2.779259357803785,
      "learning_rate": 7.158771761692464e-06,
      "loss": 0.2229,
      "step": 340
    },
    {
      "epoch": 0.7178947368421053,
      "grad_norm": 3.144584289615483,
      "learning_rate": 7.143845832136188e-06,
      "loss": 0.2245,
      "step": 341
    },
    {
      "epoch": 0.72,
      "grad_norm": 3.4111423970994186,
      "learning_rate": 7.128896457825364e-06,
      "loss": 0.2389,
      "step": 342
    },
    {
      "epoch": 0.7221052631578947,
      "grad_norm": 3.7528655875468884,
      "learning_rate": 7.113923802243957e-06,
      "loss": 0.2757,
      "step": 343
    },
    {
      "epoch": 0.7242105263157895,
      "grad_norm": 2.695128263190076,
      "learning_rate": 7.098928029130529e-06,
      "loss": 0.2143,
      "step": 344
    },
    {
      "epoch": 0.7263157894736842,
      "grad_norm": 3.6700100764481247,
      "learning_rate": 7.083909302476453e-06,
      "loss": 0.3314,
      "step": 345
    },
    {
      "epoch": 0.728421052631579,
      "grad_norm": 3.079325282545559,
      "learning_rate": 7.068867786524116e-06,
      "loss": 0.2287,
      "step": 346
    },
    {
      "epoch": 0.7305263157894737,
      "grad_norm": 3.5393540118526383,
      "learning_rate": 7.053803645765128e-06,
      "loss": 0.2296,
      "step": 347
    },
    {
      "epoch": 0.7326315789473684,
      "grad_norm": 3.077685464826638,
      "learning_rate": 7.038717044938519e-06,
      "loss": 0.2433,
      "step": 348
    },
    {
      "epoch": 0.7347368421052631,
      "grad_norm": 3.2908130199128194,
      "learning_rate": 7.023608149028936e-06,
      "loss": 0.2678,
      "step": 349
    },
    {
      "epoch": 0.7368421052631579,
      "grad_norm": 3.466611772144543,
      "learning_rate": 7.008477123264849e-06,
      "loss": 0.2877,
      "step": 350
    },
    {
      "epoch": 0.7389473684210527,
      "grad_norm": 2.6649586894139623,
      "learning_rate": 6.993324133116726e-06,
      "loss": 0.2341,
      "step": 351
    },
    {
      "epoch": 0.7410526315789474,
      "grad_norm": 2.9965647115149525,
      "learning_rate": 6.978149344295242e-06,
      "loss": 0.2468,
      "step": 352
    },
    {
      "epoch": 0.7431578947368421,
      "grad_norm": 3.3207502931977184,
      "learning_rate": 6.9629529227494575e-06,
      "loss": 0.2328,
      "step": 353
    },
    {
      "epoch": 0.7452631578947368,
      "grad_norm": 2.9140064494968216,
      "learning_rate": 6.9477350346650016e-06,
      "loss": 0.2592,
      "step": 354
    },
    {
      "epoch": 0.7473684210526316,
      "grad_norm": 3.29979844169393,
      "learning_rate": 6.932495846462262e-06,
      "loss": 0.2766,
      "step": 355
    },
    {
      "epoch": 0.7494736842105263,
      "grad_norm": 3.1444817547605175,
      "learning_rate": 6.9172355247945586e-06,
      "loss": 0.2483,
      "step": 356
    },
    {
      "epoch": 0.751578947368421,
      "grad_norm": 3.0389647906768222,
      "learning_rate": 6.901954236546324e-06,
      "loss": 0.244,
      "step": 357
    },
    {
      "epoch": 0.7536842105263157,
      "grad_norm": 3.881978525237369,
      "learning_rate": 6.88665214883128e-06,
      "loss": 0.3913,
      "step": 358
    },
    {
      "epoch": 0.7557894736842106,
      "grad_norm": 3.4375207455529604,
      "learning_rate": 6.871329428990602e-06,
      "loss": 0.2782,
      "step": 359
    },
    {
      "epoch": 0.7578947368421053,
      "grad_norm": 3.070535976817441,
      "learning_rate": 6.855986244591104e-06,
      "loss": 0.233,
      "step": 360
    },
    {
      "epoch": 0.76,
      "grad_norm": 2.7760813127556343,
      "learning_rate": 6.840622763423391e-06,
      "loss": 0.2264,
      "step": 361
    },
    {
      "epoch": 0.7621052631578947,
      "grad_norm": 3.2637686834139297,
      "learning_rate": 6.825239153500029e-06,
      "loss": 0.2083,
      "step": 362
    },
    {
      "epoch": 0.7642105263157895,
      "grad_norm": 2.9278070407238457,
      "learning_rate": 6.809835583053716e-06,
      "loss": 0.1856,
      "step": 363
    },
    {
      "epoch": 0.7663157894736842,
      "grad_norm": 3.2629951336042695,
      "learning_rate": 6.794412220535426e-06,
      "loss": 0.2623,
      "step": 364
    },
    {
      "epoch": 0.7684210526315789,
      "grad_norm": 3.6433993183893927,
      "learning_rate": 6.778969234612583e-06,
      "loss": 0.279,
      "step": 365
    },
    {
      "epoch": 0.7705263157894737,
      "grad_norm": 2.6196051292482117,
      "learning_rate": 6.763506794167207e-06,
      "loss": 0.2303,
      "step": 366
    },
    {
      "epoch": 0.7726315789473684,
      "grad_norm": 3.494474145198592,
      "learning_rate": 6.748025068294067e-06,
      "loss": 0.2685,
      "step": 367
    },
    {
      "epoch": 0.7747368421052632,
      "grad_norm": 3.7734222764866043,
      "learning_rate": 6.732524226298841e-06,
      "loss": 0.2162,
      "step": 368
    },
    {
      "epoch": 0.7768421052631579,
      "grad_norm": 4.056775355673952,
      "learning_rate": 6.717004437696249e-06,
      "loss": 0.3167,
      "step": 369
    },
    {
      "epoch": 0.7789473684210526,
      "grad_norm": 3.1668706807914133,
      "learning_rate": 6.701465872208216e-06,
      "loss": 0.2249,
      "step": 370
    },
    {
      "epoch": 0.7810526315789473,
      "grad_norm": 3.522250191145657,
      "learning_rate": 6.685908699762003e-06,
      "loss": 0.245,
      "step": 371
    },
| { |
| "epoch": 0.783157894736842, |
| "grad_norm": 3.2561672462411044, |
| "learning_rate": 6.670333090488357e-06, |
| "loss": 0.2627, |
| "step": 372 |
| }, |
| { |
| "epoch": 0.7852631578947369, |
| "grad_norm": 3.3885462690447468, |
| "learning_rate": 6.654739214719642e-06, |
| "loss": 0.248, |
| "step": 373 |
| }, |
| { |
| "epoch": 0.7873684210526316, |
| "grad_norm": 2.48853753145028, |
| "learning_rate": 6.6391272429879886e-06, |
| "loss": 0.1883, |
| "step": 374 |
| }, |
| { |
| "epoch": 0.7894736842105263, |
| "grad_norm": 2.8009606275161802, |
| "learning_rate": 6.6234973460234184e-06, |
| "loss": 0.2771, |
| "step": 375 |
| }, |
| { |
| "epoch": 0.791578947368421, |
| "grad_norm": 2.9535272661326255, |
| "learning_rate": 6.607849694751978e-06, |
| "loss": 0.2588, |
| "step": 376 |
| }, |
| { |
| "epoch": 0.7936842105263158, |
| "grad_norm": 2.6990032013425505, |
| "learning_rate": 6.592184460293878e-06, |
| "loss": 0.188, |
| "step": 377 |
| }, |
| { |
| "epoch": 0.7957894736842105, |
| "grad_norm": 3.5549552239785998, |
| "learning_rate": 6.576501813961609e-06, |
| "loss": 0.2613, |
| "step": 378 |
| }, |
| { |
| "epoch": 0.7978947368421052, |
| "grad_norm": 3.1945303123549795, |
| "learning_rate": 6.560801927258081e-06, |
| "loss": 0.2114, |
| "step": 379 |
| }, |
| { |
| "epoch": 0.8, |
| "grad_norm": 3.0825376124104857, |
| "learning_rate": 6.545084971874738e-06, |
| "loss": 0.1909, |
| "step": 380 |
| }, |
| { |
| "epoch": 0.8021052631578948, |
| "grad_norm": 3.1479328070094956, |
| "learning_rate": 6.529351119689687e-06, |
| "loss": 0.3183, |
| "step": 381 |
| }, |
| { |
| "epoch": 0.8042105263157895, |
| "grad_norm": 2.6365167516964108, |
| "learning_rate": 6.513600542765816e-06, |
| "loss": 0.1805, |
| "step": 382 |
| }, |
| { |
| "epoch": 0.8063157894736842, |
| "grad_norm": 2.826135221160075, |
| "learning_rate": 6.49783341334891e-06, |
| "loss": 0.2439, |
| "step": 383 |
| }, |
| { |
| "epoch": 0.8084210526315789, |
| "grad_norm": 2.743253754387738, |
| "learning_rate": 6.4820499038657695e-06, |
| "loss": 0.168, |
| "step": 384 |
| }, |
| { |
| "epoch": 0.8105263157894737, |
| "grad_norm": 2.809461612204882, |
| "learning_rate": 6.466250186922325e-06, |
| "loss": 0.2294, |
| "step": 385 |
| }, |
| { |
| "epoch": 0.8126315789473684, |
| "grad_norm": 3.775735076866835, |
| "learning_rate": 6.450434435301751e-06, |
| "loss": 0.3232, |
| "step": 386 |
| }, |
| { |
| "epoch": 0.8147368421052632, |
| "grad_norm": 3.5899045267489527, |
| "learning_rate": 6.434602821962571e-06, |
| "loss": 0.2821, |
| "step": 387 |
| }, |
| { |
| "epoch": 0.8168421052631579, |
| "grad_norm": 3.078761874653056, |
| "learning_rate": 6.418755520036775e-06, |
| "loss": 0.2414, |
| "step": 388 |
| }, |
| { |
| "epoch": 0.8189473684210526, |
| "grad_norm": 3.2185090152119704, |
| "learning_rate": 6.402892702827916e-06, |
| "loss": 0.197, |
| "step": 389 |
| }, |
| { |
| "epoch": 0.8210526315789474, |
| "grad_norm": 2.778844684918169, |
| "learning_rate": 6.387014543809224e-06, |
| "loss": 0.2411, |
| "step": 390 |
| }, |
| { |
| "epoch": 0.8231578947368421, |
| "grad_norm": 2.9779712090916566, |
| "learning_rate": 6.371121216621698e-06, |
| "loss": 0.2405, |
| "step": 391 |
| }, |
| { |
| "epoch": 0.8252631578947368, |
| "grad_norm": 3.0589344259681575, |
| "learning_rate": 6.355212895072223e-06, |
| "loss": 0.2662, |
| "step": 392 |
| }, |
| { |
| "epoch": 0.8273684210526315, |
| "grad_norm": 3.044689736554792, |
| "learning_rate": 6.339289753131649e-06, |
| "loss": 0.2052, |
| "step": 393 |
| }, |
| { |
| "epoch": 0.8294736842105264, |
| "grad_norm": 2.909939485878581, |
| "learning_rate": 6.323351964932909e-06, |
| "loss": 0.2637, |
| "step": 394 |
| }, |
| { |
| "epoch": 0.8315789473684211, |
| "grad_norm": 3.7255183082841206, |
| "learning_rate": 6.3073997047691e-06, |
| "loss": 0.3135, |
| "step": 395 |
| }, |
| { |
| "epoch": 0.8336842105263158, |
| "grad_norm": 3.09956930432557, |
| "learning_rate": 6.291433147091583e-06, |
| "loss": 0.2073, |
| "step": 396 |
| }, |
| { |
| "epoch": 0.8357894736842105, |
| "grad_norm": 3.208752314181852, |
| "learning_rate": 6.275452466508076e-06, |
| "loss": 0.2369, |
| "step": 397 |
| }, |
| { |
| "epoch": 0.8378947368421052, |
| "grad_norm": 3.332585956508526, |
| "learning_rate": 6.259457837780741e-06, |
| "loss": 0.2653, |
| "step": 398 |
| }, |
| { |
| "epoch": 0.84, |
| "grad_norm": 3.712274323531537, |
| "learning_rate": 6.243449435824276e-06, |
| "loss": 0.299, |
| "step": 399 |
| }, |
| { |
| "epoch": 0.8421052631578947, |
| "grad_norm": 3.4787692263707193, |
| "learning_rate": 6.227427435703997e-06, |
| "loss": 0.2623, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.8421052631578947, |
| "eval_loss": 0.22630169987678528, |
| "eval_runtime": 0.9241, |
| "eval_samples_per_second": 42.203, |
| "eval_steps_per_second": 10.821, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.8442105263157895, |
| "grad_norm": 3.251478421854529, |
| "learning_rate": 6.211392012633932e-06, |
| "loss": 0.2174, |
| "step": 401 |
| }, |
| { |
| "epoch": 0.8463157894736842, |
| "grad_norm": 3.5725773836009496, |
| "learning_rate": 6.1953433419748995e-06, |
| "loss": 0.3207, |
| "step": 402 |
| }, |
| { |
| "epoch": 0.848421052631579, |
| "grad_norm": 2.678551532523693, |
| "learning_rate": 6.179281599232592e-06, |
| "loss": 0.2134, |
| "step": 403 |
| }, |
| { |
| "epoch": 0.8505263157894737, |
| "grad_norm": 2.930489162104918, |
| "learning_rate": 6.163206960055652e-06, |
| "loss": 0.234, |
| "step": 404 |
| }, |
| { |
| "epoch": 0.8526315789473684, |
| "grad_norm": 3.3941005641549373, |
| "learning_rate": 6.147119600233758e-06, |
| "loss": 0.2361, |
| "step": 405 |
| }, |
| { |
| "epoch": 0.8547368421052631, |
| "grad_norm": 3.4358663312804594, |
| "learning_rate": 6.131019695695702e-06, |
| "loss": 0.3, |
| "step": 406 |
| }, |
| { |
| "epoch": 0.8568421052631578, |
| "grad_norm": 3.3210473494075443, |
| "learning_rate": 6.114907422507459e-06, |
| "loss": 0.2277, |
| "step": 407 |
| }, |
| { |
| "epoch": 0.8589473684210527, |
| "grad_norm": 3.397617904327594, |
| "learning_rate": 6.098782956870266e-06, |
| "loss": 0.2644, |
| "step": 408 |
| }, |
| { |
| "epoch": 0.8610526315789474, |
| "grad_norm": 3.3393246989566787, |
| "learning_rate": 6.0826464751187e-06, |
| "loss": 0.2439, |
| "step": 409 |
| }, |
| { |
| "epoch": 0.8631578947368421, |
| "grad_norm": 3.8491410481959227, |
| "learning_rate": 6.066498153718735e-06, |
| "loss": 0.2582, |
| "step": 410 |
| }, |
| { |
| "epoch": 0.8652631578947368, |
| "grad_norm": 2.7791598243883193, |
| "learning_rate": 6.0503381692658305e-06, |
| "loss": 0.2287, |
| "step": 411 |
| }, |
| { |
| "epoch": 0.8673684210526316, |
| "grad_norm": 3.6422120621092957, |
| "learning_rate": 6.034166698482984e-06, |
| "loss": 0.3196, |
| "step": 412 |
| }, |
| { |
| "epoch": 0.8694736842105263, |
| "grad_norm": 3.1317609570704787, |
| "learning_rate": 6.0179839182188125e-06, |
| "loss": 0.1853, |
| "step": 413 |
| }, |
| { |
| "epoch": 0.871578947368421, |
| "grad_norm": 2.7141515622263257, |
| "learning_rate": 6.001790005445607e-06, |
| "loss": 0.1998, |
| "step": 414 |
| }, |
| { |
| "epoch": 0.8736842105263158, |
| "grad_norm": 3.1598775704943836, |
| "learning_rate": 5.985585137257401e-06, |
| "loss": 0.2632, |
| "step": 415 |
| }, |
| { |
| "epoch": 0.8757894736842106, |
| "grad_norm": 3.446573016898305, |
| "learning_rate": 5.969369490868042e-06, |
| "loss": 0.2501, |
| "step": 416 |
| }, |
| { |
| "epoch": 0.8778947368421053, |
| "grad_norm": 3.0594511196593444, |
| "learning_rate": 5.953143243609235e-06, |
| "loss": 0.2514, |
| "step": 417 |
| }, |
| { |
| "epoch": 0.88, |
| "grad_norm": 3.2145264513666905, |
| "learning_rate": 5.936906572928625e-06, |
| "loss": 0.2655, |
| "step": 418 |
| }, |
| { |
| "epoch": 0.8821052631578947, |
| "grad_norm": 3.3756141872799645, |
| "learning_rate": 5.920659656387836e-06, |
| "loss": 0.2525, |
| "step": 419 |
| }, |
| { |
| "epoch": 0.8842105263157894, |
| "grad_norm": 2.758856115220705, |
| "learning_rate": 5.904402671660551e-06, |
| "loss": 0.2103, |
| "step": 420 |
| }, |
| { |
| "epoch": 0.8863157894736842, |
| "grad_norm": 3.2338348283135, |
| "learning_rate": 5.8881357965305444e-06, |
| "loss": 0.2543, |
| "step": 421 |
| }, |
| { |
| "epoch": 0.888421052631579, |
| "grad_norm": 2.898358500545659, |
| "learning_rate": 5.871859208889759e-06, |
| "loss": 0.2117, |
| "step": 422 |
| }, |
| { |
| "epoch": 0.8905263157894737, |
| "grad_norm": 3.0693690724990064, |
| "learning_rate": 5.855573086736351e-06, |
| "loss": 0.228, |
| "step": 423 |
| }, |
| { |
| "epoch": 0.8926315789473684, |
| "grad_norm": 3.22975196387293, |
| "learning_rate": 5.839277608172739e-06, |
| "loss": 0.2577, |
| "step": 424 |
| }, |
| { |
| "epoch": 0.8947368421052632, |
| "grad_norm": 4.392345175921754, |
| "learning_rate": 5.82297295140367e-06, |
| "loss": 0.3319, |
| "step": 425 |
| }, |
| { |
| "epoch": 0.8968421052631579, |
| "grad_norm": 3.35468992972689, |
| "learning_rate": 5.806659294734256e-06, |
| "loss": 0.2141, |
| "step": 426 |
| }, |
| { |
| "epoch": 0.8989473684210526, |
| "grad_norm": 3.2440283643656262, |
| "learning_rate": 5.790336816568033e-06, |
| "loss": 0.2201, |
| "step": 427 |
| }, |
| { |
| "epoch": 0.9010526315789473, |
| "grad_norm": 2.935911272760269, |
| "learning_rate": 5.774005695405008e-06, |
| "loss": 0.2017, |
| "step": 428 |
| }, |
| { |
| "epoch": 0.9031578947368422, |
| "grad_norm": 3.1890795129615994, |
| "learning_rate": 5.7576661098397024e-06, |
| "loss": 0.2918, |
| "step": 429 |
| }, |
| { |
| "epoch": 0.9052631578947369, |
| "grad_norm": 3.500288661868837, |
| "learning_rate": 5.74131823855921e-06, |
| "loss": 0.2761, |
| "step": 430 |
| }, |
| { |
| "epoch": 0.9073684210526316, |
| "grad_norm": 3.414156857820881, |
| "learning_rate": 5.72496226034123e-06, |
| "loss": 0.2257, |
| "step": 431 |
| }, |
| { |
| "epoch": 0.9094736842105263, |
| "grad_norm": 3.225368384638819, |
| "learning_rate": 5.708598354052122e-06, |
| "loss": 0.2721, |
| "step": 432 |
| }, |
| { |
| "epoch": 0.911578947368421, |
| "grad_norm": 3.5718042208512375, |
| "learning_rate": 5.692226698644938e-06, |
| "loss": 0.2247, |
| "step": 433 |
| }, |
| { |
| "epoch": 0.9136842105263158, |
| "grad_norm": 2.5936514914444047, |
| "learning_rate": 5.675847473157485e-06, |
| "loss": 0.215, |
| "step": 434 |
| }, |
| { |
| "epoch": 0.9157894736842105, |
| "grad_norm": 3.8268265538198327, |
| "learning_rate": 5.659460856710346e-06, |
| "loss": 0.2675, |
| "step": 435 |
| }, |
| { |
| "epoch": 0.9178947368421052, |
| "grad_norm": 2.591405308225686, |
| "learning_rate": 5.643067028504931e-06, |
| "loss": 0.1786, |
| "step": 436 |
| }, |
| { |
| "epoch": 0.92, |
| "grad_norm": 3.098466939386034, |
| "learning_rate": 5.626666167821522e-06, |
| "loss": 0.205, |
| "step": 437 |
| }, |
| { |
| "epoch": 0.9221052631578948, |
| "grad_norm": 3.1761782292767564, |
| "learning_rate": 5.610258454017301e-06, |
| "loss": 0.2534, |
| "step": 438 |
| }, |
| { |
| "epoch": 0.9242105263157895, |
| "grad_norm": 2.9386193495829795, |
| "learning_rate": 5.593844066524401e-06, |
| "loss": 0.2702, |
| "step": 439 |
| }, |
| { |
| "epoch": 0.9263157894736842, |
| "grad_norm": 3.2355731219150394, |
| "learning_rate": 5.577423184847932e-06, |
| "loss": 0.2768, |
| "step": 440 |
| }, |
| { |
| "epoch": 0.9284210526315789, |
| "grad_norm": 3.253238208386654, |
| "learning_rate": 5.560995988564023e-06, |
| "loss": 0.2728, |
| "step": 441 |
| }, |
| { |
| "epoch": 0.9305263157894736, |
| "grad_norm": 4.1574139012640225, |
| "learning_rate": 5.544562657317863e-06, |
| "loss": 0.2815, |
| "step": 442 |
| }, |
| { |
| "epoch": 0.9326315789473684, |
| "grad_norm": 3.4530558929645663, |
| "learning_rate": 5.52812337082173e-06, |
| "loss": 0.2517, |
| "step": 443 |
| }, |
| { |
| "epoch": 0.9347368421052632, |
| "grad_norm": 3.034360151981781, |
| "learning_rate": 5.5116783088530255e-06, |
| "loss": 0.2124, |
| "step": 444 |
| }, |
| { |
| "epoch": 0.9368421052631579, |
| "grad_norm": 3.4312547440487333, |
| "learning_rate": 5.495227651252315e-06, |
| "loss": 0.2977, |
| "step": 445 |
| }, |
| { |
| "epoch": 0.9389473684210526, |
| "grad_norm": 2.855876800810897, |
| "learning_rate": 5.478771577921351e-06, |
| "loss": 0.2099, |
| "step": 446 |
| }, |
| { |
| "epoch": 0.9410526315789474, |
| "grad_norm": 3.332726274404472, |
| "learning_rate": 5.4623102688211186e-06, |
| "loss": 0.2924, |
| "step": 447 |
| }, |
| { |
| "epoch": 0.9431578947368421, |
| "grad_norm": 4.407070803673364, |
| "learning_rate": 5.445843903969854e-06, |
| "loss": 0.2892, |
| "step": 448 |
| }, |
| { |
| "epoch": 0.9452631578947368, |
| "grad_norm": 3.0253661541082875, |
| "learning_rate": 5.429372663441086e-06, |
| "loss": 0.2156, |
| "step": 449 |
| }, |
| { |
| "epoch": 0.9473684210526315, |
| "grad_norm": 2.8412635352832187, |
| "learning_rate": 5.412896727361663e-06, |
| "loss": 0.2284, |
| "step": 450 |
| }, |
| { |
| "epoch": 0.9494736842105264, |
| "grad_norm": 3.5868259684516075, |
| "learning_rate": 5.396416275909779e-06, |
| "loss": 0.2759, |
| "step": 451 |
| }, |
| { |
| "epoch": 0.9515789473684211, |
| "grad_norm": 2.996265256795479, |
| "learning_rate": 5.379931489313016e-06, |
| "loss": 0.2234, |
| "step": 452 |
| }, |
| { |
| "epoch": 0.9536842105263158, |
| "grad_norm": 2.6413020642647242, |
| "learning_rate": 5.363442547846356e-06, |
| "loss": 0.1867, |
| "step": 453 |
| }, |
| { |
| "epoch": 0.9557894736842105, |
| "grad_norm": 2.989468542441537, |
| "learning_rate": 5.346949631830221e-06, |
| "loss": 0.2212, |
| "step": 454 |
| }, |
| { |
| "epoch": 0.9578947368421052, |
| "grad_norm": 3.517740282273809, |
| "learning_rate": 5.3304529216284974e-06, |
| "loss": 0.322, |
| "step": 455 |
| }, |
| { |
| "epoch": 0.96, |
| "grad_norm": 2.6529272773403485, |
| "learning_rate": 5.3139525976465675e-06, |
| "loss": 0.1765, |
| "step": 456 |
| }, |
| { |
| "epoch": 0.9621052631578947, |
| "grad_norm": 3.682108270044108, |
| "learning_rate": 5.2974488403293285e-06, |
| "loss": 0.2842, |
| "step": 457 |
| }, |
| { |
| "epoch": 0.9642105263157895, |
| "grad_norm": 3.020601772073395, |
| "learning_rate": 5.280941830159228e-06, |
| "loss": 0.2485, |
| "step": 458 |
| }, |
| { |
| "epoch": 0.9663157894736842, |
| "grad_norm": 2.7942998887866453, |
| "learning_rate": 5.264431747654284e-06, |
| "loss": 0.191, |
| "step": 459 |
| }, |
| { |
| "epoch": 0.968421052631579, |
| "grad_norm": 3.24583853868691, |
| "learning_rate": 5.247918773366112e-06, |
| "loss": 0.2327, |
| "step": 460 |
| }, |
| { |
| "epoch": 0.9705263157894737, |
| "grad_norm": 2.8171630730122534, |
| "learning_rate": 5.231403087877955e-06, |
| "loss": 0.2234, |
| "step": 461 |
| }, |
| { |
| "epoch": 0.9726315789473684, |
| "grad_norm": 2.8270485667195544, |
| "learning_rate": 5.214884871802703e-06, |
| "loss": 0.2309, |
| "step": 462 |
| }, |
| { |
| "epoch": 0.9747368421052631, |
| "grad_norm": 3.2640422013428587, |
| "learning_rate": 5.198364305780922e-06, |
| "loss": 0.2152, |
| "step": 463 |
| }, |
| { |
| "epoch": 0.9768421052631578, |
| "grad_norm": 2.8640531088651375, |
| "learning_rate": 5.1818415704788725e-06, |
| "loss": 0.2366, |
| "step": 464 |
| }, |
| { |
| "epoch": 0.9789473684210527, |
| "grad_norm": 2.842698759687521, |
| "learning_rate": 5.165316846586541e-06, |
| "loss": 0.2241, |
| "step": 465 |
| }, |
| { |
| "epoch": 0.9810526315789474, |
| "grad_norm": 2.4466969685896465, |
| "learning_rate": 5.148790314815662e-06, |
| "loss": 0.1975, |
| "step": 466 |
| }, |
| { |
| "epoch": 0.9831578947368421, |
| "grad_norm": 3.043987452889092, |
| "learning_rate": 5.132262155897739e-06, |
| "loss": 0.2044, |
| "step": 467 |
| }, |
| { |
| "epoch": 0.9852631578947368, |
| "grad_norm": 3.93638285635615, |
| "learning_rate": 5.11573255058207e-06, |
| "loss": 0.2927, |
| "step": 468 |
| }, |
| { |
| "epoch": 0.9873684210526316, |
| "grad_norm": 3.1860838277038224, |
| "learning_rate": 5.099201679633769e-06, |
| "loss": 0.3051, |
| "step": 469 |
| }, |
| { |
| "epoch": 0.9894736842105263, |
| "grad_norm": 2.6486537512870725, |
| "learning_rate": 5.082669723831793e-06, |
| "loss": 0.1287, |
| "step": 470 |
| }, |
| { |
| "epoch": 0.991578947368421, |
| "grad_norm": 3.022725909103708, |
| "learning_rate": 5.066136863966963e-06, |
| "loss": 0.2423, |
| "step": 471 |
| }, |
| { |
| "epoch": 0.9936842105263158, |
| "grad_norm": 3.2304036278113095, |
| "learning_rate": 5.049603280839982e-06, |
| "loss": 0.2274, |
| "step": 472 |
| }, |
| { |
| "epoch": 0.9957894736842106, |
| "grad_norm": 2.5800916231325393, |
| "learning_rate": 5.033069155259471e-06, |
| "loss": 0.2118, |
| "step": 473 |
| }, |
| { |
| "epoch": 0.9978947368421053, |
| "grad_norm": 2.3680672194474335, |
| "learning_rate": 5.016534668039976e-06, |
| "loss": 0.1445, |
| "step": 474 |
| }, |
| { |
| "epoch": 1.0, |
| "grad_norm": 2.8673882894664535, |
| "learning_rate": 5e-06, |
| "loss": 0.2295, |
| "step": 475 |
| }, |
| { |
| "epoch": 1.0021052631578948, |
| "grad_norm": 2.295670347098231, |
| "learning_rate": 4.983465331960025e-06, |
| "loss": 0.0976, |
| "step": 476 |
| }, |
| { |
| "epoch": 1.0042105263157894, |
| "grad_norm": 2.2098086978202747, |
| "learning_rate": 4.96693084474053e-06, |
| "loss": 0.1486, |
| "step": 477 |
| }, |
| { |
| "epoch": 1.0063157894736843, |
| "grad_norm": 2.2844750468890185, |
| "learning_rate": 4.950396719160019e-06, |
| "loss": 0.1171, |
| "step": 478 |
| }, |
| { |
| "epoch": 1.0084210526315789, |
| "grad_norm": 2.6215587379391407, |
| "learning_rate": 4.93386313603304e-06, |
| "loss": 0.1056, |
| "step": 479 |
| }, |
| { |
| "epoch": 1.0105263157894737, |
| "grad_norm": 2.7267133338007685, |
| "learning_rate": 4.917330276168208e-06, |
| "loss": 0.1602, |
| "step": 480 |
| }, |
| { |
| "epoch": 1.0126315789473683, |
| "grad_norm": 2.3440015356753774, |
| "learning_rate": 4.900798320366233e-06, |
| "loss": 0.1126, |
| "step": 481 |
| }, |
| { |
| "epoch": 1.0147368421052632, |
| "grad_norm": 2.052176267404225, |
| "learning_rate": 4.884267449417932e-06, |
| "loss": 0.0856, |
| "step": 482 |
| }, |
| { |
| "epoch": 1.016842105263158, |
| "grad_norm": 2.773326388210988, |
| "learning_rate": 4.867737844102261e-06, |
| "loss": 0.1343, |
| "step": 483 |
| }, |
| { |
| "epoch": 1.0189473684210526, |
| "grad_norm": 2.3704172545807984, |
| "learning_rate": 4.851209685184339e-06, |
| "loss": 0.0955, |
| "step": 484 |
| }, |
| { |
| "epoch": 1.0210526315789474, |
| "grad_norm": 2.693276477530952, |
| "learning_rate": 4.8346831534134595e-06, |
| "loss": 0.1107, |
| "step": 485 |
| }, |
| { |
| "epoch": 1.023157894736842, |
| "grad_norm": 2.5177397539501007, |
| "learning_rate": 4.818158429521129e-06, |
| "loss": 0.1256, |
| "step": 486 |
| }, |
| { |
| "epoch": 1.0252631578947369, |
| "grad_norm": 2.76258073472734, |
| "learning_rate": 4.801635694219079e-06, |
| "loss": 0.0977, |
| "step": 487 |
| }, |
| { |
| "epoch": 1.0273684210526315, |
| "grad_norm": 2.745546482466762, |
| "learning_rate": 4.785115128197298e-06, |
| "loss": 0.121, |
| "step": 488 |
| }, |
| { |
| "epoch": 1.0294736842105263, |
| "grad_norm": 2.8689928800544413, |
| "learning_rate": 4.768596912122046e-06, |
| "loss": 0.1043, |
| "step": 489 |
| }, |
| { |
| "epoch": 1.0315789473684212, |
| "grad_norm": 3.2713014779449185, |
| "learning_rate": 4.752081226633888e-06, |
| "loss": 0.1329, |
| "step": 490 |
| }, |
| { |
| "epoch": 1.0336842105263158, |
| "grad_norm": 2.6940653620000843, |
| "learning_rate": 4.735568252345718e-06, |
| "loss": 0.0763, |
| "step": 491 |
| }, |
| { |
| "epoch": 1.0357894736842106, |
| "grad_norm": 2.7748381875190065, |
| "learning_rate": 4.719058169840773e-06, |
| "loss": 0.1041, |
| "step": 492 |
| }, |
| { |
| "epoch": 1.0378947368421052, |
| "grad_norm": 2.96405722060865, |
| "learning_rate": 4.702551159670672e-06, |
| "loss": 0.1081, |
| "step": 493 |
| }, |
| { |
| "epoch": 1.04, |
| "grad_norm": 3.2702432569124644, |
| "learning_rate": 4.686047402353433e-06, |
| "loss": 0.1033, |
| "step": 494 |
| }, |
| { |
| "epoch": 1.0421052631578946, |
| "grad_norm": 3.1378226854518765, |
| "learning_rate": 4.669547078371503e-06, |
| "loss": 0.1203, |
| "step": 495 |
| }, |
| { |
| "epoch": 1.0442105263157895, |
| "grad_norm": 3.0459911779334723, |
| "learning_rate": 4.65305036816978e-06, |
| "loss": 0.1177, |
| "step": 496 |
| }, |
| { |
| "epoch": 1.0463157894736843, |
| "grad_norm": 3.039349255011064, |
| "learning_rate": 4.636557452153645e-06, |
| "loss": 0.0716, |
| "step": 497 |
| }, |
| { |
| "epoch": 1.048421052631579, |
| "grad_norm": 3.1967200174187647, |
| "learning_rate": 4.620068510686985e-06, |
| "loss": 0.0955, |
| "step": 498 |
| }, |
| { |
| "epoch": 1.0505263157894738, |
| "grad_norm": 3.08653598155148, |
| "learning_rate": 4.60358372409022e-06, |
| "loss": 0.1073, |
| "step": 499 |
| }, |
| { |
| "epoch": 1.0526315789473684, |
| "grad_norm": 3.608351560740164, |
| "learning_rate": 4.587103272638339e-06, |
| "loss": 0.1192, |
| "step": 500 |
| }, |
| { |
| "epoch": 1.0547368421052632, |
| "grad_norm": 2.985044729505937, |
| "learning_rate": 4.570627336558915e-06, |
| "loss": 0.1077, |
| "step": 501 |
| }, |
| { |
| "epoch": 1.0568421052631578, |
| "grad_norm": 2.3865033858620714, |
| "learning_rate": 4.554156096030149e-06, |
| "loss": 0.069, |
| "step": 502 |
| }, |
| { |
| "epoch": 1.0589473684210526, |
| "grad_norm": 2.9290127417667318, |
| "learning_rate": 4.537689731178883e-06, |
| "loss": 0.0982, |
| "step": 503 |
| }, |
| { |
| "epoch": 1.0610526315789475, |
| "grad_norm": 3.8440200841289887, |
| "learning_rate": 4.5212284220786495e-06, |
| "loss": 0.1302, |
| "step": 504 |
| }, |
| { |
| "epoch": 1.063157894736842, |
| "grad_norm": 2.8853341217614283, |
| "learning_rate": 4.504772348747687e-06, |
| "loss": 0.1145, |
| "step": 505 |
| }, |
| { |
| "epoch": 1.065263157894737, |
| "grad_norm": 2.6679058498506545, |
| "learning_rate": 4.488321691146975e-06, |
| "loss": 0.0904, |
| "step": 506 |
| }, |
| { |
| "epoch": 1.0673684210526315, |
| "grad_norm": 2.8951003825916075, |
| "learning_rate": 4.471876629178273e-06, |
| "loss": 0.103, |
| "step": 507 |
| }, |
| { |
| "epoch": 1.0694736842105264, |
| "grad_norm": 3.007348462141689, |
| "learning_rate": 4.4554373426821375e-06, |
| "loss": 0.099, |
| "step": 508 |
| }, |
| { |
| "epoch": 1.071578947368421, |
| "grad_norm": 3.328459980789624, |
| "learning_rate": 4.439004011435979e-06, |
| "loss": 0.1205, |
| "step": 509 |
| }, |
| { |
| "epoch": 1.0736842105263158, |
| "grad_norm": 2.949855877718661, |
| "learning_rate": 4.42257681515207e-06, |
| "loss": 0.0874, |
| "step": 510 |
| }, |
| { |
| "epoch": 1.0757894736842106, |
| "grad_norm": 3.6391199686370443, |
| "learning_rate": 4.406155933475599e-06, |
| "loss": 0.1249, |
| "step": 511 |
| }, |
| { |
| "epoch": 1.0778947368421052, |
| "grad_norm": 3.2478829168809926, |
| "learning_rate": 4.3897415459827e-06, |
| "loss": 0.1167, |
| "step": 512 |
| }, |
| { |
| "epoch": 1.08, |
| "grad_norm": 2.841083467241702, |
| "learning_rate": 4.373333832178478e-06, |
| "loss": 0.1017, |
| "step": 513 |
| }, |
| { |
| "epoch": 1.0821052631578947, |
| "grad_norm": 3.3982868679317044, |
| "learning_rate": 4.356932971495071e-06, |
| "loss": 0.0945, |
| "step": 514 |
| }, |
| { |
| "epoch": 1.0842105263157895, |
| "grad_norm": 2.890261480563195, |
| "learning_rate": 4.340539143289655e-06, |
| "loss": 0.1087, |
| "step": 515 |
| }, |
| { |
| "epoch": 1.0863157894736841, |
| "grad_norm": 3.308334318534236, |
| "learning_rate": 4.324152526842517e-06, |
| "loss": 0.0905, |
| "step": 516 |
| }, |
| { |
| "epoch": 1.088421052631579, |
| "grad_norm": 2.8073590228296204, |
| "learning_rate": 4.307773301355063e-06, |
| "loss": 0.1182, |
| "step": 517 |
| }, |
| { |
| "epoch": 1.0905263157894738, |
| "grad_norm": 2.9727549576474557, |
| "learning_rate": 4.291401645947879e-06, |
| "loss": 0.1169, |
| "step": 518 |
| }, |
| { |
| "epoch": 1.0926315789473684, |
| "grad_norm": 2.7817050820561464, |
| "learning_rate": 4.275037739658771e-06, |
| "loss": 0.1098, |
| "step": 519 |
| }, |
| { |
| "epoch": 1.0947368421052632, |
| "grad_norm": 2.3497572335814225, |
| "learning_rate": 4.25868176144079e-06, |
| "loss": 0.0562, |
| "step": 520 |
| }, |
| { |
| "epoch": 1.0968421052631578, |
| "grad_norm": 3.3243494735102814, |
| "learning_rate": 4.242333890160299e-06, |
| "loss": 0.1095, |
| "step": 521 |
| }, |
| { |
| "epoch": 1.0989473684210527, |
| "grad_norm": 2.6231886257195915, |
| "learning_rate": 4.225994304594994e-06, |
| "loss": 0.0896, |
| "step": 522 |
| }, |
| { |
| "epoch": 1.1010526315789473, |
| "grad_norm": 3.0525576073188803, |
| "learning_rate": 4.209663183431969e-06, |
| "loss": 0.123, |
| "step": 523 |
| }, |
| { |
| "epoch": 1.1031578947368421, |
| "grad_norm": 2.300663876671176, |
| "learning_rate": 4.193340705265746e-06, |
| "loss": 0.0967, |
| "step": 524 |
| }, |
| { |
| "epoch": 1.1052631578947367, |
| "grad_norm": 2.6473318145668077, |
| "learning_rate": 4.17702704859633e-06, |
| "loss": 0.0812, |
| "step": 525 |
| }, |
| { |
| "epoch": 1.1073684210526316, |
| "grad_norm": 3.1969088912792007, |
| "learning_rate": 4.160722391827262e-06, |
| "loss": 0.1204, |
| "step": 526 |
| }, |
| { |
| "epoch": 1.1094736842105264, |
| "grad_norm": 3.001046456322388, |
| "learning_rate": 4.14442691326365e-06, |
| "loss": 0.0939, |
| "step": 527 |
| }, |
| { |
| "epoch": 1.111578947368421, |
| "grad_norm": 2.992113792126852, |
| "learning_rate": 4.128140791110243e-06, |
| "loss": 0.0933, |
| "step": 528 |
| }, |
| { |
| "epoch": 1.1136842105263158, |
| "grad_norm": 2.873261259224336, |
| "learning_rate": 4.111864203469457e-06, |
| "loss": 0.113, |
| "step": 529 |
| }, |
| { |
| "epoch": 1.1157894736842104, |
| "grad_norm": 2.227852704764954, |
| "learning_rate": 4.0955973283394525e-06, |
| "loss": 0.0822, |
| "step": 530 |
| }, |
| { |
| "epoch": 1.1178947368421053, |
| "grad_norm": 2.503374847865978, |
| "learning_rate": 4.079340343612165e-06, |
| "loss": 0.1093, |
| "step": 531 |
| }, |
| { |
| "epoch": 1.12, |
| "grad_norm": 2.867424738355881, |
| "learning_rate": 4.063093427071376e-06, |
| "loss": 0.0901, |
| "step": 532 |
| }, |
| { |
| "epoch": 1.1221052631578947, |
| "grad_norm": 2.9247925764549842, |
| "learning_rate": 4.046856756390767e-06, |
| "loss": 0.0914, |
| "step": 533 |
| }, |
| { |
| "epoch": 1.1242105263157895, |
| "grad_norm": 3.122299881486441, |
| "learning_rate": 4.03063050913196e-06, |
| "loss": 0.0874, |
| "step": 534 |
| }, |
| { |
| "epoch": 1.1263157894736842, |
| "grad_norm": 2.4666341115474673, |
| "learning_rate": 4.0144148627426e-06, |
| "loss": 0.092, |
| "step": 535 |
| }, |
| { |
| "epoch": 1.128421052631579, |
| "grad_norm": 2.601382923998426, |
| "learning_rate": 3.998209994554395e-06, |
| "loss": 0.0896, |
| "step": 536 |
| }, |
| { |
| "epoch": 1.1305263157894736, |
| "grad_norm": 3.3371781721212095, |
| "learning_rate": 3.982016081781189e-06, |
| "loss": 0.1336, |
| "step": 537 |
| }, |
| { |
| "epoch": 1.1326315789473684, |
| "grad_norm": 3.2001419892406666, |
| "learning_rate": 3.965833301517017e-06, |
| "loss": 0.0934, |
| "step": 538 |
| }, |
| { |
| "epoch": 1.134736842105263, |
| "grad_norm": 2.971320892945007, |
| "learning_rate": 3.949661830734172e-06, |
| "loss": 0.1143, |
| "step": 539 |
| }, |
| { |
| "epoch": 1.1368421052631579, |
| "grad_norm": 2.710169633405829, |
| "learning_rate": 3.9335018462812664e-06, |
| "loss": 0.0817, |
| "step": 540 |
| }, |
| { |
| "epoch": 1.1389473684210527, |
| "grad_norm": 3.102427528491057, |
| "learning_rate": 3.9173535248813026e-06, |
| "loss": 0.09, |
| "step": 541 |
| }, |
| { |
| "epoch": 1.1410526315789473, |
| "grad_norm": 2.3838774593905443, |
| "learning_rate": 3.901217043129735e-06, |
| "loss": 0.0843, |
| "step": 542 |
| }, |
| { |
| "epoch": 1.1431578947368422, |
| "grad_norm": 3.195089972641954, |
| "learning_rate": 3.885092577492543e-06, |
| "loss": 0.1437, |
| "step": 543 |
| }, |
| { |
| "epoch": 1.1452631578947368, |
| "grad_norm": 2.9164511963016713, |
| "learning_rate": 3.8689803043043e-06, |
| "loss": 0.1064, |
| "step": 544 |
| }, |
| { |
| "epoch": 1.1473684210526316, |
| "grad_norm": 2.7560183158584914, |
| "learning_rate": 3.852880399766243e-06, |
| "loss": 0.0975, |
| "step": 545 |
| }, |
| { |
| "epoch": 1.1494736842105264, |
| "grad_norm": 2.743001182985708, |
| "learning_rate": 3.8367930399443495e-06, |
| "loss": 0.1093, |
| "step": 546 |
| }, |
| { |
| "epoch": 1.151578947368421, |
| "grad_norm": 2.717089115263253, |
| "learning_rate": 3.820718400767409e-06, |
| "loss": 0.0735, |
| "step": 547 |
| }, |
| { |
| "epoch": 1.1536842105263159, |
| "grad_norm": 3.4175913700020626, |
| "learning_rate": 3.8046566580251e-06, |
| "loss": 0.1101, |
| "step": 548 |
| }, |
| { |
| "epoch": 1.1557894736842105, |
| "grad_norm": 3.787166948611024, |
| "learning_rate": 3.7886079873660693e-06, |
| "loss": 0.1461, |
| "step": 549 |
| }, |
| { |
| "epoch": 1.1578947368421053, |
| "grad_norm": 2.668580712780857, |
| "learning_rate": 3.7725725642960047e-06, |
| "loss": 0.0858, |
| "step": 550 |
| }, |
| { |
| "epoch": 1.16, |
| "grad_norm": 2.7065005668874607, |
| "learning_rate": 3.756550564175727e-06, |
| "loss": 0.0695, |
| "step": 551 |
| }, |
| { |
| "epoch": 1.1621052631578948, |
| "grad_norm": 2.6470645824370354, |
| "learning_rate": 3.7405421622192607e-06, |
| "loss": 0.108, |
| "step": 552 |
| }, |
| { |
| "epoch": 1.1642105263157894, |
| "grad_norm": 3.2629124702164582, |
| "learning_rate": 3.7245475334919246e-06, |
| "loss": 0.1309, |
| "step": 553 |
| }, |
| { |
| "epoch": 1.1663157894736842, |
| "grad_norm": 2.9270566638475564, |
| "learning_rate": 3.7085668529084183e-06, |
| "loss": 0.076, |
| "step": 554 |
| }, |
| { |
| "epoch": 1.168421052631579, |
| "grad_norm": 2.9549638887471854, |
| "learning_rate": 3.6926002952309015e-06, |
| "loss": 0.0912, |
| "step": 555 |
| }, |
| { |
| "epoch": 1.1705263157894736, |
| "grad_norm": 2.920345651586514, |
| "learning_rate": 3.676648035067093e-06, |
| "loss": 0.0958, |
| "step": 556 |
| }, |
| { |
| "epoch": 1.1726315789473685, |
| "grad_norm": 3.7647197204198126, |
| "learning_rate": 3.6607102468683524e-06, |
| "loss": 0.104, |
| "step": 557 |
| }, |
| { |
| "epoch": 1.174736842105263, |
| "grad_norm": 2.25055864611599, |
| "learning_rate": 3.64478710492778e-06, |
| "loss": 0.0596, |
| "step": 558 |
| }, |
| { |
| "epoch": 1.176842105263158, |
| "grad_norm": 2.3389330498436447, |
| "learning_rate": 3.628878783378302e-06, |
| "loss": 0.0952, |
| "step": 559 |
| }, |
| { |
| "epoch": 1.1789473684210527, |
| "grad_norm": 3.2077006313008947, |
| "learning_rate": 3.6129854561907786e-06, |
| "loss": 0.0962, |
| "step": 560 |
| }, |
| { |
| "epoch": 1.1810526315789474, |
| "grad_norm": 3.0715551570828863, |
| "learning_rate": 3.5971072971720844e-06, |
| "loss": 0.1206, |
| "step": 561 |
| }, |
| { |
| "epoch": 1.1831578947368422, |
| "grad_norm": 3.6249545529720097, |
| "learning_rate": 3.581244479963225e-06, |
| "loss": 0.1172, |
| "step": 562 |
| }, |
| { |
| "epoch": 1.1852631578947368, |
| "grad_norm": 3.196135915761518, |
| "learning_rate": 3.56539717803743e-06, |
| "loss": 0.1041, |
| "step": 563 |
| }, |
| { |
| "epoch": 1.1873684210526316, |
| "grad_norm": 3.151854573878974, |
| "learning_rate": 3.5495655646982506e-06, |
| "loss": 0.1152, |
| "step": 564 |
| }, |
| { |
| "epoch": 1.1894736842105262, |
| "grad_norm": 2.684765122673656, |
| "learning_rate": 3.533749813077677e-06, |
| "loss": 0.1038, |
| "step": 565 |
| }, |
| { |
| "epoch": 1.191578947368421, |
| "grad_norm": 2.4791366806308277, |
| "learning_rate": 3.517950096134232e-06, |
| "loss": 0.0895, |
| "step": 566 |
| }, |
| { |
| "epoch": 1.1936842105263157, |
| "grad_norm": 2.4823858121020574, |
| "learning_rate": 3.5021665866510924e-06, |
| "loss": 0.0833, |
| "step": 567 |
| }, |
| { |
| "epoch": 1.1957894736842105, |
| "grad_norm": 2.465078895228644, |
| "learning_rate": 3.4863994572341845e-06, |
| "loss": 0.0962, |
| "step": 568 |
| }, |
| { |
| "epoch": 1.1978947368421053, |
| "grad_norm": 2.6695037410176767, |
| "learning_rate": 3.470648880310313e-06, |
| "loss": 0.0765, |
| "step": 569 |
| }, |
| { |
| "epoch": 1.2, |
| "grad_norm": 3.230727181652887, |
| "learning_rate": 3.4549150281252635e-06, |
| "loss": 0.1003, |
| "step": 570 |
| }, |
| { |
| "epoch": 1.2021052631578948, |
| "grad_norm": 2.6749391293551943, |
| "learning_rate": 3.4391980727419206e-06, |
| "loss": 0.0825, |
| "step": 571 |
| }, |
| { |
| "epoch": 1.2042105263157894, |
| "grad_norm": 3.1995780646497654, |
| "learning_rate": 3.423498186038393e-06, |
| "loss": 0.095, |
| "step": 572 |
| }, |
| { |
| "epoch": 1.2063157894736842, |
| "grad_norm": 3.05943838476226, |
| "learning_rate": 3.4078155397061243e-06, |
| "loss": 0.0718, |
| "step": 573 |
| }, |
| { |
| "epoch": 1.208421052631579, |
| "grad_norm": 2.5246593838638285, |
| "learning_rate": 3.3921503052480243e-06, |
| "loss": 0.0689, |
| "step": 574 |
| }, |
| { |
| "epoch": 1.2105263157894737, |
| "grad_norm": 2.5885059878463297, |
| "learning_rate": 3.3765026539765832e-06, |
| "loss": 0.1036, |
| "step": 575 |
| }, |
| { |
| "epoch": 1.2126315789473685, |
| "grad_norm": 3.771547100866626, |
| "learning_rate": 3.3608727570120114e-06, |
| "loss": 0.1341, |
| "step": 576 |
| }, |
| { |
| "epoch": 1.2147368421052631, |
| "grad_norm": 2.88823941459496, |
| "learning_rate": 3.3452607852803585e-06, |
| "loss": 0.1221, |
| "step": 577 |
| }, |
| { |
| "epoch": 1.216842105263158, |
| "grad_norm": 2.5098432758853058, |
| "learning_rate": 3.3296669095116454e-06, |
| "loss": 0.0793, |
| "step": 578 |
| }, |
| { |
| "epoch": 1.2189473684210526, |
| "grad_norm": 3.173791392616761, |
| "learning_rate": 3.3140913002379993e-06, |
| "loss": 0.0924, |
| "step": 579 |
| }, |
| { |
| "epoch": 1.2210526315789474, |
| "grad_norm": 2.9056096452656655, |
| "learning_rate": 3.298534127791785e-06, |
| "loss": 0.1232, |
| "step": 580 |
| }, |
| { |
| "epoch": 1.223157894736842, |
| "grad_norm": 2.7600704251604133, |
| "learning_rate": 3.2829955623037536e-06, |
| "loss": 0.1011, |
| "step": 581 |
| }, |
| { |
| "epoch": 1.2252631578947368, |
| "grad_norm": 2.8902773903565824, |
| "learning_rate": 3.267475773701161e-06, |
| "loss": 0.0971, |
| "step": 582 |
| }, |
| { |
| "epoch": 1.2273684210526317, |
| "grad_norm": 2.7682434724339284, |
| "learning_rate": 3.251974931705933e-06, |
| "loss": 0.0863, |
| "step": 583 |
| }, |
| { |
| "epoch": 1.2294736842105263, |
| "grad_norm": 2.7630463201120534, |
| "learning_rate": 3.236493205832795e-06, |
| "loss": 0.0691, |
| "step": 584 |
| }, |
| { |
| "epoch": 1.231578947368421, |
| "grad_norm": 2.769102376716963, |
| "learning_rate": 3.2210307653874175e-06, |
| "loss": 0.069, |
| "step": 585 |
| }, |
| { |
| "epoch": 1.2336842105263157, |
| "grad_norm": 2.624470281681522, |
| "learning_rate": 3.205587779464576e-06, |
| "loss": 0.1175, |
| "step": 586 |
| }, |
| { |
| "epoch": 1.2357894736842105, |
| "grad_norm": 3.1237062690477515, |
| "learning_rate": 3.1901644169462854e-06, |
| "loss": 0.1124, |
| "step": 587 |
| }, |
| { |
| "epoch": 1.2378947368421054, |
| "grad_norm": 2.880688626388798, |
| "learning_rate": 3.1747608464999723e-06, |
| "loss": 0.0824, |
| "step": 588 |
| }, |
| { |
| "epoch": 1.24, |
| "grad_norm": 4.407201987822293, |
| "learning_rate": 3.1593772365766107e-06, |
| "loss": 0.1062, |
| "step": 589 |
| }, |
| { |
| "epoch": 1.2421052631578948, |
| "grad_norm": 2.87359540180944, |
| "learning_rate": 3.1440137554088957e-06, |
| "loss": 0.1029, |
| "step": 590 |
| }, |
| { |
| "epoch": 1.2442105263157894, |
| "grad_norm": 2.9419016713556503, |
| "learning_rate": 3.128670571009399e-06, |
| "loss": 0.0943, |
| "step": 591 |
| }, |
| { |
| "epoch": 1.2463157894736843, |
| "grad_norm": 2.9623583273834977, |
| "learning_rate": 3.1133478511687217e-06, |
| "loss": 0.083, |
| "step": 592 |
| }, |
| { |
| "epoch": 1.2484210526315789, |
| "grad_norm": 2.4484737136573056, |
| "learning_rate": 3.0980457634536775e-06, |
| "loss": 0.0697, |
| "step": 593 |
| }, |
| { |
| "epoch": 1.2505263157894737, |
| "grad_norm": 2.983258535236656, |
| "learning_rate": 3.082764475205442e-06, |
| "loss": 0.0895, |
| "step": 594 |
| }, |
| { |
| "epoch": 1.2526315789473683, |
| "grad_norm": 2.927679313524382, |
| "learning_rate": 3.06750415353774e-06, |
| "loss": 0.1269, |
| "step": 595 |
| }, |
| { |
| "epoch": 1.2547368421052632, |
| "grad_norm": 4.327826033733036, |
| "learning_rate": 3.052264965335e-06, |
| "loss": 0.1779, |
| "step": 596 |
| }, |
| { |
| "epoch": 1.256842105263158, |
| "grad_norm": 3.513787096535581, |
| "learning_rate": 3.0370470772505433e-06, |
| "loss": 0.1133, |
| "step": 597 |
| }, |
| { |
| "epoch": 1.2589473684210526, |
| "grad_norm": 2.1372319523914785, |
| "learning_rate": 3.02185065570476e-06, |
| "loss": 0.0641, |
| "step": 598 |
| }, |
| { |
| "epoch": 1.2610526315789474, |
| "grad_norm": 3.136784939810971, |
| "learning_rate": 3.0066758668832752e-06, |
| "loss": 0.1238, |
| "step": 599 |
| }, |
| { |
| "epoch": 1.263157894736842, |
| "grad_norm": 2.7814275077914346, |
| "learning_rate": 2.991522876735154e-06, |
| "loss": 0.106, |
| "step": 600 |
| }, |
| { |
| "epoch": 1.263157894736842, |
| "eval_loss": 0.23879918456077576, |
| "eval_runtime": 0.9256, |
| "eval_samples_per_second": 42.133, |
| "eval_steps_per_second": 10.803, |
| "step": 600 |
| }, |
| { |
| "epoch": 1.2652631578947369, |
| "grad_norm": 2.4532843992196898, |
| "learning_rate": 2.9763918509710647e-06, |
| "loss": 0.0901, |
| "step": 601 |
| }, |
| { |
| "epoch": 1.2673684210526317, |
| "grad_norm": 3.1129983606325835, |
| "learning_rate": 2.9612829550614836e-06, |
| "loss": 0.0836, |
| "step": 602 |
| }, |
| { |
| "epoch": 1.2694736842105263, |
| "grad_norm": 2.9949558761740893, |
| "learning_rate": 2.9461963542348737e-06, |
| "loss": 0.1009, |
| "step": 603 |
| }, |
| { |
| "epoch": 1.271578947368421, |
| "grad_norm": 3.5141135708573157, |
| "learning_rate": 2.931132213475884e-06, |
| "loss": 0.1061, |
| "step": 604 |
| }, |
| { |
| "epoch": 1.2736842105263158, |
| "grad_norm": 2.4903783090106293, |
| "learning_rate": 2.9160906975235493e-06, |
| "loss": 0.085, |
| "step": 605 |
| }, |
| { |
| "epoch": 1.2757894736842106, |
| "grad_norm": 2.665551851720217, |
| "learning_rate": 2.9010719708694724e-06, |
| "loss": 0.0795, |
| "step": 606 |
| }, |
| { |
| "epoch": 1.2778947368421052, |
| "grad_norm": 2.251800484141915, |
| "learning_rate": 2.8860761977560435e-06, |
| "loss": 0.0663, |
| "step": 607 |
| }, |
| { |
| "epoch": 1.28, |
| "grad_norm": 2.7348175822860044, |
| "learning_rate": 2.871103542174637e-06, |
| "loss": 0.0978, |
| "step": 608 |
| }, |
| { |
| "epoch": 1.2821052631578946, |
| "grad_norm": 3.8066305763587605, |
| "learning_rate": 2.8561541678638145e-06, |
| "loss": 0.1121, |
| "step": 609 |
| }, |
| { |
| "epoch": 1.2842105263157895, |
| "grad_norm": 2.896761981069954, |
| "learning_rate": 2.8412282383075362e-06, |
| "loss": 0.0972, |
| "step": 610 |
| }, |
| { |
| "epoch": 1.2863157894736843, |
| "grad_norm": 2.78332338625659, |
| "learning_rate": 2.826325916733378e-06, |
| "loss": 0.0944, |
| "step": 611 |
| }, |
| { |
| "epoch": 1.288421052631579, |
| "grad_norm": 3.3783431646548583, |
| "learning_rate": 2.811447366110741e-06, |
| "loss": 0.0838, |
| "step": 612 |
| }, |
| { |
| "epoch": 1.2905263157894737, |
| "grad_norm": 2.5129928876526195, |
| "learning_rate": 2.796592749149071e-06, |
| "loss": 0.1033, |
| "step": 613 |
| }, |
| { |
| "epoch": 1.2926315789473684, |
| "grad_norm": 2.7310324993825272, |
| "learning_rate": 2.7817622282960816e-06, |
| "loss": 0.0822, |
| "step": 614 |
| }, |
| { |
| "epoch": 1.2947368421052632, |
| "grad_norm": 2.7894468868847584, |
| "learning_rate": 2.766955965735968e-06, |
| "loss": 0.097, |
| "step": 615 |
| }, |
| { |
| "epoch": 1.296842105263158, |
| "grad_norm": 2.692351340111761, |
| "learning_rate": 2.7521741233876496e-06, |
| "loss": 0.1008, |
| "step": 616 |
| }, |
| { |
| "epoch": 1.2989473684210526, |
| "grad_norm": 2.7211489958669426, |
| "learning_rate": 2.7374168629029814e-06, |
| "loss": 0.0662, |
| "step": 617 |
| }, |
| { |
| "epoch": 1.3010526315789472, |
| "grad_norm": 3.0694815194789506, |
| "learning_rate": 2.722684345665004e-06, |
| "loss": 0.1068, |
| "step": 618 |
| }, |
| { |
| "epoch": 1.303157894736842, |
| "grad_norm": 2.311408410154088, |
| "learning_rate": 2.707976732786166e-06, |
| "loss": 0.0855, |
| "step": 619 |
| }, |
| { |
| "epoch": 1.305263157894737, |
| "grad_norm": 2.67419162103436, |
| "learning_rate": 2.693294185106562e-06, |
| "loss": 0.079, |
| "step": 620 |
| }, |
| { |
| "epoch": 1.3073684210526315, |
| "grad_norm": 3.2884510438078256, |
| "learning_rate": 2.678636863192184e-06, |
| "loss": 0.1076, |
| "step": 621 |
| }, |
| { |
| "epoch": 1.3094736842105263, |
| "grad_norm": 2.5974919371371192, |
| "learning_rate": 2.6640049273331516e-06, |
| "loss": 0.0738, |
| "step": 622 |
| }, |
| { |
| "epoch": 1.311578947368421, |
| "grad_norm": 2.9992265829306604, |
| "learning_rate": 2.649398537541978e-06, |
| "loss": 0.0948, |
| "step": 623 |
| }, |
| { |
| "epoch": 1.3136842105263158, |
| "grad_norm": 2.8279603530668953, |
| "learning_rate": 2.6348178535517967e-06, |
| "loss": 0.0952, |
| "step": 624 |
| }, |
| { |
| "epoch": 1.3157894736842106, |
| "grad_norm": 2.616721301345279, |
| "learning_rate": 2.6202630348146323e-06, |
| "loss": 0.0991, |
| "step": 625 |
| }, |
| { |
| "epoch": 1.3178947368421052, |
| "grad_norm": 3.1702653547071344, |
| "learning_rate": 2.605734240499652e-06, |
| "loss": 0.0863, |
| "step": 626 |
| }, |
| { |
| "epoch": 1.32, |
| "grad_norm": 2.675405760590981, |
| "learning_rate": 2.5912316294914232e-06, |
| "loss": 0.0731, |
| "step": 627 |
| }, |
| { |
| "epoch": 1.3221052631578947, |
| "grad_norm": 3.0539267481690784, |
| "learning_rate": 2.576755360388177e-06, |
| "loss": 0.0969, |
| "step": 628 |
| }, |
| { |
| "epoch": 1.3242105263157895, |
| "grad_norm": 2.905114269140358, |
| "learning_rate": 2.562305591500069e-06, |
| "loss": 0.0938, |
| "step": 629 |
| }, |
| { |
| "epoch": 1.3263157894736843, |
| "grad_norm": 2.731625609008078, |
| "learning_rate": 2.5478824808474613e-06, |
| "loss": 0.0794, |
| "step": 630 |
| }, |
| { |
| "epoch": 1.328421052631579, |
| "grad_norm": 2.613073402073297, |
| "learning_rate": 2.5334861861591753e-06, |
| "loss": 0.0854, |
| "step": 631 |
| }, |
| { |
| "epoch": 1.3305263157894736, |
| "grad_norm": 2.9544597662379553, |
| "learning_rate": 2.5191168648707888e-06, |
| "loss": 0.0999, |
| "step": 632 |
| }, |
| { |
| "epoch": 1.3326315789473684, |
| "grad_norm": 2.7561606747769094, |
| "learning_rate": 2.5047746741228977e-06, |
| "loss": 0.0714, |
| "step": 633 |
| }, |
| { |
| "epoch": 1.3347368421052632, |
| "grad_norm": 2.8887518490755153, |
| "learning_rate": 2.490459770759398e-06, |
| "loss": 0.0939, |
| "step": 634 |
| }, |
| { |
| "epoch": 1.3368421052631578, |
| "grad_norm": 3.317195982139847, |
| "learning_rate": 2.476172311325783e-06, |
| "loss": 0.0884, |
| "step": 635 |
| }, |
| { |
| "epoch": 1.3389473684210527, |
| "grad_norm": 3.1750724188681003, |
| "learning_rate": 2.461912452067415e-06, |
| "loss": 0.0936, |
| "step": 636 |
| }, |
| { |
| "epoch": 1.3410526315789473, |
| "grad_norm": 3.618501354891483, |
| "learning_rate": 2.447680348927837e-06, |
| "loss": 0.1202, |
| "step": 637 |
| }, |
| { |
| "epoch": 1.343157894736842, |
| "grad_norm": 3.3236366679875715, |
| "learning_rate": 2.433476157547044e-06, |
| "loss": 0.1117, |
| "step": 638 |
| }, |
| { |
| "epoch": 1.345263157894737, |
| "grad_norm": 2.752799678732494, |
| "learning_rate": 2.4193000332597984e-06, |
| "loss": 0.1264, |
| "step": 639 |
| }, |
| { |
| "epoch": 1.3473684210526315, |
| "grad_norm": 2.747193986144216, |
| "learning_rate": 2.4051521310939258e-06, |
| "loss": 0.106, |
| "step": 640 |
| }, |
| { |
| "epoch": 1.3494736842105264, |
| "grad_norm": 2.972619060264223, |
| "learning_rate": 2.391032605768613e-06, |
| "loss": 0.0923, |
| "step": 641 |
| }, |
| { |
| "epoch": 1.351578947368421, |
| "grad_norm": 3.053201871204932, |
| "learning_rate": 2.3769416116927335e-06, |
| "loss": 0.115, |
| "step": 642 |
| }, |
| { |
| "epoch": 1.3536842105263158, |
| "grad_norm": 2.7376151110688522, |
| "learning_rate": 2.3628793029631353e-06, |
| "loss": 0.116, |
| "step": 643 |
| }, |
| { |
| "epoch": 1.3557894736842107, |
| "grad_norm": 2.9174931758051836, |
| "learning_rate": 2.3488458333629777e-06, |
| "loss": 0.1051, |
| "step": 644 |
| }, |
| { |
| "epoch": 1.3578947368421053, |
| "grad_norm": 2.4716264469344704, |
| "learning_rate": 2.3348413563600324e-06, |
| "loss": 0.1107, |
| "step": 645 |
| }, |
| { |
| "epoch": 1.3599999999999999, |
| "grad_norm": 3.0530343837320117, |
| "learning_rate": 2.320866025105016e-06, |
| "loss": 0.1334, |
| "step": 646 |
| }, |
| { |
| "epoch": 1.3621052631578947, |
| "grad_norm": 3.6619589060484645, |
| "learning_rate": 2.3069199924299175e-06, |
| "loss": 0.1366, |
| "step": 647 |
| }, |
| { |
| "epoch": 1.3642105263157895, |
| "grad_norm": 2.922855147887494, |
| "learning_rate": 2.29300341084631e-06, |
| "loss": 0.094, |
| "step": 648 |
| }, |
| { |
| "epoch": 1.3663157894736842, |
| "grad_norm": 2.550799037652647, |
| "learning_rate": 2.2791164325437047e-06, |
| "loss": 0.0855, |
| "step": 649 |
| }, |
| { |
| "epoch": 1.368421052631579, |
| "grad_norm": 2.9412512573198053, |
| "learning_rate": 2.265259209387867e-06, |
| "loss": 0.1052, |
| "step": 650 |
| }, |
| { |
| "epoch": 1.3705263157894736, |
| "grad_norm": 2.511840236449145, |
| "learning_rate": 2.2514318929191707e-06, |
| "loss": 0.0708, |
| "step": 651 |
| }, |
| { |
| "epoch": 1.3726315789473684, |
| "grad_norm": 3.1808030308692596, |
| "learning_rate": 2.2376346343509343e-06, |
| "loss": 0.0748, |
| "step": 652 |
| }, |
| { |
| "epoch": 1.3747368421052633, |
| "grad_norm": 3.3523627968871987, |
| "learning_rate": 2.2238675845677663e-06, |
| "loss": 0.0983, |
| "step": 653 |
| }, |
| { |
| "epoch": 1.3768421052631579, |
| "grad_norm": 2.932190635989121, |
| "learning_rate": 2.2101308941239204e-06, |
| "loss": 0.0637, |
| "step": 654 |
| }, |
| { |
| "epoch": 1.3789473684210527, |
| "grad_norm": 3.5807536284623835, |
| "learning_rate": 2.1964247132416373e-06, |
| "loss": 0.1136, |
| "step": 655 |
| }, |
| { |
| "epoch": 1.3810526315789473, |
| "grad_norm": 2.9803291761278263, |
| "learning_rate": 2.182749191809518e-06, |
| "loss": 0.0949, |
| "step": 656 |
| }, |
| { |
| "epoch": 1.3831578947368421, |
| "grad_norm": 3.02234271668722, |
| "learning_rate": 2.1691044793808734e-06, |
| "loss": 0.0999, |
| "step": 657 |
| }, |
| { |
| "epoch": 1.385263157894737, |
| "grad_norm": 2.679900836385144, |
| "learning_rate": 2.1554907251720947e-06, |
| "loss": 0.0785, |
| "step": 658 |
| }, |
| { |
| "epoch": 1.3873684210526316, |
| "grad_norm": 2.2709498222109366, |
| "learning_rate": 2.1419080780610123e-06, |
| "loss": 0.066, |
| "step": 659 |
| }, |
| { |
| "epoch": 1.3894736842105262, |
| "grad_norm": 3.0917162559984215, |
| "learning_rate": 2.1283566865852824e-06, |
| "loss": 0.1, |
| "step": 660 |
| }, |
| { |
| "epoch": 1.391578947368421, |
| "grad_norm": 2.5872224056024313, |
| "learning_rate": 2.11483669894075e-06, |
| "loss": 0.09, |
| "step": 661 |
| }, |
| { |
| "epoch": 1.3936842105263159, |
| "grad_norm": 2.5962906490038, |
| "learning_rate": 2.1013482629798334e-06, |
| "loss": 0.0755, |
| "step": 662 |
| }, |
| { |
| "epoch": 1.3957894736842105, |
| "grad_norm": 2.9664694685613333, |
| "learning_rate": 2.08789152620991e-06, |
| "loss": 0.1177, |
| "step": 663 |
| }, |
| { |
| "epoch": 1.3978947368421053, |
| "grad_norm": 2.7105630014133517, |
| "learning_rate": 2.0744666357916925e-06, |
| "loss": 0.0926, |
| "step": 664 |
| }, |
| { |
| "epoch": 1.4, |
| "grad_norm": 2.762223630715002, |
| "learning_rate": 2.061073738537635e-06, |
| "loss": 0.0989, |
| "step": 665 |
| }, |
| { |
| "epoch": 1.4021052631578947, |
| "grad_norm": 2.895940938610082, |
| "learning_rate": 2.0477129809103147e-06, |
| "loss": 0.0845, |
| "step": 666 |
| }, |
| { |
| "epoch": 1.4042105263157896, |
| "grad_norm": 3.255484673069398, |
| "learning_rate": 2.034384509020837e-06, |
| "loss": 0.0987, |
| "step": 667 |
| }, |
| { |
| "epoch": 1.4063157894736842, |
| "grad_norm": 3.062882069284984, |
| "learning_rate": 2.021088468627237e-06, |
| "loss": 0.1285, |
| "step": 668 |
| }, |
| { |
| "epoch": 1.408421052631579, |
| "grad_norm": 2.450287955725665, |
| "learning_rate": 2.0078250051328783e-06, |
| "loss": 0.0728, |
| "step": 669 |
| }, |
| { |
| "epoch": 1.4105263157894736, |
| "grad_norm": 3.470552198050232, |
| "learning_rate": 1.9945942635848745e-06, |
| "loss": 0.1207, |
| "step": 670 |
| }, |
| { |
| "epoch": 1.4126315789473685, |
| "grad_norm": 2.8537747610834385, |
| "learning_rate": 1.981396388672496e-06, |
| "loss": 0.1049, |
| "step": 671 |
| }, |
| { |
| "epoch": 1.4147368421052633, |
| "grad_norm": 3.7024680682537885, |
| "learning_rate": 1.9682315247255897e-06, |
| "loss": 0.1043, |
| "step": 672 |
| }, |
| { |
| "epoch": 1.416842105263158, |
| "grad_norm": 2.8609102232586743, |
| "learning_rate": 1.9550998157129946e-06, |
| "loss": 0.0694, |
| "step": 673 |
| }, |
| { |
| "epoch": 1.4189473684210525, |
| "grad_norm": 3.262083913972591, |
| "learning_rate": 1.9420014052409793e-06, |
| "loss": 0.1082, |
| "step": 674 |
| }, |
| { |
| "epoch": 1.4210526315789473, |
| "grad_norm": 3.0240021646625235, |
| "learning_rate": 1.928936436551661e-06, |
| "loss": 0.0882, |
| "step": 675 |
| }, |
| { |
| "epoch": 1.4231578947368422, |
| "grad_norm": 3.4463090162258827, |
| "learning_rate": 1.915905052521445e-06, |
| "loss": 0.0924, |
| "step": 676 |
| }, |
| { |
| "epoch": 1.4252631578947368, |
| "grad_norm": 2.431106797497799, |
| "learning_rate": 1.9029073956594607e-06, |
| "loss": 0.0887, |
| "step": 677 |
| }, |
| { |
| "epoch": 1.4273684210526316, |
| "grad_norm": 2.915704184565381, |
| "learning_rate": 1.8899436081059974e-06, |
| "loss": 0.0847, |
| "step": 678 |
| }, |
| { |
| "epoch": 1.4294736842105262, |
| "grad_norm": 2.7352077174816993, |
| "learning_rate": 1.877013831630961e-06, |
| "loss": 0.0768, |
| "step": 679 |
| }, |
| { |
| "epoch": 1.431578947368421, |
| "grad_norm": 2.8189895917814205, |
| "learning_rate": 1.864118207632315e-06, |
| "loss": 0.0785, |
| "step": 680 |
| }, |
| { |
| "epoch": 1.433684210526316, |
| "grad_norm": 2.664425517756487, |
| "learning_rate": 1.851256877134538e-06, |
| "loss": 0.0836, |
| "step": 681 |
| }, |
| { |
| "epoch": 1.4357894736842105, |
| "grad_norm": 2.9705486470960083, |
| "learning_rate": 1.838429980787081e-06, |
| "loss": 0.1191, |
| "step": 682 |
| }, |
| { |
| "epoch": 1.4378947368421053, |
| "grad_norm": 2.540310721891877, |
| "learning_rate": 1.825637658862824e-06, |
| "loss": 0.0878, |
| "step": 683 |
| }, |
| { |
| "epoch": 1.44, |
| "grad_norm": 2.661992219604887, |
| "learning_rate": 1.8128800512565514e-06, |
| "loss": 0.0929, |
| "step": 684 |
| }, |
| { |
| "epoch": 1.4421052631578948, |
| "grad_norm": 3.04507185100223, |
| "learning_rate": 1.8001572974834169e-06, |
| "loss": 0.1103, |
| "step": 685 |
| }, |
| { |
| "epoch": 1.4442105263157896, |
| "grad_norm": 2.9020461461830642, |
| "learning_rate": 1.7874695366774191e-06, |
| "loss": 0.11, |
| "step": 686 |
| }, |
| { |
| "epoch": 1.4463157894736842, |
| "grad_norm": 3.11468571675733, |
| "learning_rate": 1.774816907589873e-06, |
| "loss": 0.0998, |
| "step": 687 |
| }, |
| { |
| "epoch": 1.4484210526315788, |
| "grad_norm": 2.359239660620598, |
| "learning_rate": 1.7621995485879062e-06, |
| "loss": 0.075, |
| "step": 688 |
| }, |
| { |
| "epoch": 1.4505263157894737, |
| "grad_norm": 3.2955548369625465, |
| "learning_rate": 1.749617597652934e-06, |
| "loss": 0.0777, |
| "step": 689 |
| }, |
| { |
| "epoch": 1.4526315789473685, |
| "grad_norm": 2.8006711851285577, |
| "learning_rate": 1.7370711923791567e-06, |
| "loss": 0.112, |
| "step": 690 |
| }, |
| { |
| "epoch": 1.454736842105263, |
| "grad_norm": 3.029355082755488, |
| "learning_rate": 1.7245604699720536e-06, |
| "loss": 0.0633, |
| "step": 691 |
| }, |
| { |
| "epoch": 1.456842105263158, |
| "grad_norm": 3.304711518434829, |
| "learning_rate": 1.7120855672468779e-06, |
| "loss": 0.0883, |
| "step": 692 |
| }, |
| { |
| "epoch": 1.4589473684210525, |
| "grad_norm": 2.765032151598941, |
| "learning_rate": 1.6996466206271679e-06, |
| "loss": 0.0793, |
| "step": 693 |
| }, |
| { |
| "epoch": 1.4610526315789474, |
| "grad_norm": 3.2989793435037122, |
| "learning_rate": 1.6872437661432518e-06, |
| "loss": 0.1019, |
| "step": 694 |
| }, |
| { |
| "epoch": 1.4631578947368422, |
| "grad_norm": 3.419570656017536, |
| "learning_rate": 1.6748771394307584e-06, |
| "loss": 0.1102, |
| "step": 695 |
| }, |
| { |
| "epoch": 1.4652631578947368, |
| "grad_norm": 2.6151861620453696, |
| "learning_rate": 1.6625468757291379e-06, |
| "loss": 0.0815, |
| "step": 696 |
| }, |
| { |
| "epoch": 1.4673684210526317, |
| "grad_norm": 2.8968743795344594, |
| "learning_rate": 1.6502531098801756e-06, |
| "loss": 0.117, |
| "step": 697 |
| }, |
| { |
| "epoch": 1.4694736842105263, |
| "grad_norm": 3.048208312310303, |
| "learning_rate": 1.6379959763265268e-06, |
| "loss": 0.1159, |
| "step": 698 |
| }, |
| { |
| "epoch": 1.471578947368421, |
| "grad_norm": 3.0348755335107644, |
| "learning_rate": 1.62577560911024e-06, |
| "loss": 0.0954, |
| "step": 699 |
| }, |
| { |
| "epoch": 1.4736842105263157, |
| "grad_norm": 2.731399950881053, |
| "learning_rate": 1.6135921418712959e-06, |
| "loss": 0.0796, |
| "step": 700 |
| }, |
| { |
| "epoch": 1.4757894736842105, |
| "grad_norm": 2.450146380331398, |
| "learning_rate": 1.6014457078461354e-06, |
| "loss": 0.0818, |
| "step": 701 |
| }, |
| { |
| "epoch": 1.4778947368421052, |
| "grad_norm": 3.1040839922685945, |
| "learning_rate": 1.5893364398662175e-06, |
| "loss": 0.1282, |
| "step": 702 |
| }, |
| { |
| "epoch": 1.48, |
| "grad_norm": 3.3128390186937953, |
| "learning_rate": 1.5772644703565564e-06, |
| "loss": 0.1131, |
| "step": 703 |
| }, |
| { |
| "epoch": 1.4821052631578948, |
| "grad_norm": 2.782315822851225, |
| "learning_rate": 1.5652299313342772e-06, |
| "loss": 0.1103, |
| "step": 704 |
| }, |
| { |
| "epoch": 1.4842105263157894, |
| "grad_norm": 3.348999608405835, |
| "learning_rate": 1.5532329544071712e-06, |
| "loss": 0.1013, |
| "step": 705 |
| }, |
| { |
| "epoch": 1.4863157894736843, |
| "grad_norm": 2.4420984824535874, |
| "learning_rate": 1.5412736707722537e-06, |
| "loss": 0.0961, |
| "step": 706 |
| }, |
| { |
| "epoch": 1.4884210526315789, |
| "grad_norm": 3.27158380123177, |
| "learning_rate": 1.5293522112143371e-06, |
| "loss": 0.1032, |
| "step": 707 |
| }, |
| { |
| "epoch": 1.4905263157894737, |
| "grad_norm": 2.5206467029781425, |
| "learning_rate": 1.517468706104589e-06, |
| "loss": 0.0673, |
| "step": 708 |
| }, |
| { |
| "epoch": 1.4926315789473685, |
| "grad_norm": 3.0061836822371304, |
| "learning_rate": 1.505623285399121e-06, |
| "loss": 0.0912, |
| "step": 709 |
| }, |
| { |
| "epoch": 1.4947368421052631, |
| "grad_norm": 3.6329090622373013, |
| "learning_rate": 1.4938160786375571e-06, |
| "loss": 0.1238, |
| "step": 710 |
| }, |
| { |
| "epoch": 1.496842105263158, |
| "grad_norm": 3.357709020463055, |
| "learning_rate": 1.4820472149416153e-06, |
| "loss": 0.1033, |
| "step": 711 |
| }, |
| { |
| "epoch": 1.4989473684210526, |
| "grad_norm": 2.982027831242349, |
| "learning_rate": 1.4703168230137072e-06, |
| "loss": 0.0745, |
| "step": 712 |
| }, |
| { |
| "epoch": 1.5010526315789474, |
| "grad_norm": 3.2153075823991433, |
| "learning_rate": 1.4586250311355132e-06, |
| "loss": 0.0748, |
| "step": 713 |
| }, |
| { |
| "epoch": 1.5031578947368422, |
| "grad_norm": 3.089733805213981, |
| "learning_rate": 1.4469719671666043e-06, |
| "loss": 0.1173, |
| "step": 714 |
| }, |
| { |
| "epoch": 1.5052631578947369, |
| "grad_norm": 2.5160669473252857, |
| "learning_rate": 1.4353577585430152e-06, |
| "loss": 0.065, |
| "step": 715 |
| }, |
| { |
| "epoch": 1.5073684210526315, |
| "grad_norm": 2.479299583306584, |
| "learning_rate": 1.4237825322758735e-06, |
| "loss": 0.074, |
| "step": 716 |
| }, |
| { |
| "epoch": 1.5094736842105263, |
| "grad_norm": 2.429769219845503, |
| "learning_rate": 1.412246414949997e-06, |
| "loss": 0.0876, |
| "step": 717 |
| }, |
| { |
| "epoch": 1.5115789473684211, |
| "grad_norm": 2.4449904248685743, |
| "learning_rate": 1.4007495327225162e-06, |
| "loss": 0.0925, |
| "step": 718 |
| }, |
| { |
| "epoch": 1.5136842105263157, |
| "grad_norm": 2.8176065325452755, |
| "learning_rate": 1.389292011321498e-06, |
| "loss": 0.1018, |
| "step": 719 |
| }, |
| { |
| "epoch": 1.5157894736842106, |
| "grad_norm": 3.1723117780798127, |
| "learning_rate": 1.3778739760445552e-06, |
| "loss": 0.1117, |
| "step": 720 |
| }, |
| { |
| "epoch": 1.5178947368421052, |
| "grad_norm": 2.7373105699595466, |
| "learning_rate": 1.3664955517574967e-06, |
| "loss": 0.0748, |
| "step": 721 |
| }, |
| { |
| "epoch": 1.52, |
| "grad_norm": 2.7280985272673495, |
| "learning_rate": 1.3551568628929434e-06, |
| "loss": 0.0755, |
| "step": 722 |
| }, |
| { |
| "epoch": 1.5221052631578948, |
| "grad_norm": 3.328587344616802, |
| "learning_rate": 1.343858033448982e-06, |
| "loss": 0.1077, |
| "step": 723 |
| }, |
| { |
| "epoch": 1.5242105263157895, |
| "grad_norm": 2.5855488551749906, |
| "learning_rate": 1.3325991869878013e-06, |
| "loss": 0.071, |
| "step": 724 |
| }, |
| { |
| "epoch": 1.526315789473684, |
| "grad_norm": 3.0254177227309165, |
| "learning_rate": 1.321380446634342e-06, |
| "loss": 0.1301, |
| "step": 725 |
| }, |
| { |
| "epoch": 1.528421052631579, |
| "grad_norm": 2.3929353303305274, |
| "learning_rate": 1.3102019350749528e-06, |
| "loss": 0.0688, |
| "step": 726 |
| }, |
| { |
| "epoch": 1.5305263157894737, |
| "grad_norm": 2.21131287248272, |
| "learning_rate": 1.2990637745560418e-06, |
| "loss": 0.0525, |
| "step": 727 |
| }, |
| { |
| "epoch": 1.5326315789473686, |
| "grad_norm": 2.738513806958703, |
| "learning_rate": 1.2879660868827508e-06, |
| "loss": 0.0767, |
| "step": 728 |
| }, |
| { |
| "epoch": 1.5347368421052632, |
| "grad_norm": 3.6188389817753053, |
| "learning_rate": 1.2769089934176126e-06, |
| "loss": 0.099, |
| "step": 729 |
| }, |
| { |
| "epoch": 1.5368421052631578, |
| "grad_norm": 3.045111314231803, |
| "learning_rate": 1.2658926150792321e-06, |
| "loss": 0.073, |
| "step": 730 |
| }, |
| { |
| "epoch": 1.5389473684210526, |
| "grad_norm": 3.184073980186101, |
| "learning_rate": 1.2549170723409548e-06, |
| "loss": 0.1014, |
| "step": 731 |
| }, |
| { |
| "epoch": 1.5410526315789475, |
| "grad_norm": 2.579054827239382, |
| "learning_rate": 1.243982485229559e-06, |
| "loss": 0.0795, |
| "step": 732 |
| }, |
| { |
| "epoch": 1.543157894736842, |
| "grad_norm": 3.416615737801065, |
| "learning_rate": 1.233088973323937e-06, |
| "loss": 0.0964, |
| "step": 733 |
| }, |
| { |
| "epoch": 1.545263157894737, |
| "grad_norm": 2.785734769889791, |
| "learning_rate": 1.2222366557537911e-06, |
| "loss": 0.0907, |
| "step": 734 |
| }, |
| { |
| "epoch": 1.5473684210526315, |
| "grad_norm": 2.970957261629437, |
| "learning_rate": 1.2114256511983274e-06, |
| "loss": 0.1131, |
| "step": 735 |
| }, |
| { |
| "epoch": 1.5494736842105263, |
| "grad_norm": 2.8363779652363417, |
| "learning_rate": 1.200656077884958e-06, |
| "loss": 0.0872, |
| "step": 736 |
| }, |
| { |
| "epoch": 1.5515789473684212, |
| "grad_norm": 2.7036119445964903, |
| "learning_rate": 1.189928053588012e-06, |
| "loss": 0.0964, |
| "step": 737 |
| }, |
| { |
| "epoch": 1.5536842105263158, |
| "grad_norm": 3.3335857753060516, |
| "learning_rate": 1.1792416956274443e-06, |
| "loss": 0.0786, |
| "step": 738 |
| }, |
| { |
| "epoch": 1.5557894736842104, |
| "grad_norm": 3.0926395174578056, |
| "learning_rate": 1.1685971208675539e-06, |
| "loss": 0.1136, |
| "step": 739 |
| }, |
| { |
| "epoch": 1.5578947368421052, |
| "grad_norm": 3.3419216028091627, |
| "learning_rate": 1.157994445715706e-06, |
| "loss": 0.1071, |
| "step": 740 |
| }, |
| { |
| "epoch": 1.56, |
| "grad_norm": 2.5742088370461005, |
| "learning_rate": 1.1474337861210543e-06, |
| "loss": 0.0929, |
| "step": 741 |
| }, |
| { |
| "epoch": 1.5621052631578949, |
| "grad_norm": 2.9130570321225955, |
| "learning_rate": 1.1369152575732823e-06, |
| "loss": 0.0717, |
| "step": 742 |
| }, |
| { |
| "epoch": 1.5642105263157895, |
| "grad_norm": 3.2401289778589417, |
| "learning_rate": 1.1264389751013326e-06, |
| "loss": 0.0987, |
| "step": 743 |
| }, |
| { |
| "epoch": 1.566315789473684, |
| "grad_norm": 2.4927969213579355, |
| "learning_rate": 1.1160050532721527e-06, |
| "loss": 0.0885, |
| "step": 744 |
| }, |
| { |
| "epoch": 1.568421052631579, |
| "grad_norm": 2.938530201077573, |
| "learning_rate": 1.1056136061894386e-06, |
| "loss": 0.127, |
| "step": 745 |
| }, |
| { |
| "epoch": 1.5705263157894738, |
| "grad_norm": 3.3187567343589444, |
| "learning_rate": 1.095264747492391e-06, |
| "loss": 0.1238, |
| "step": 746 |
| }, |
| { |
| "epoch": 1.5726315789473684, |
| "grad_norm": 3.8303977690990356, |
| "learning_rate": 1.0849585903544707e-06, |
| "loss": 0.1056, |
| "step": 747 |
| }, |
| { |
| "epoch": 1.5747368421052632, |
| "grad_norm": 3.14168947299502, |
| "learning_rate": 1.0746952474821615e-06, |
| "loss": 0.1123, |
| "step": 748 |
| }, |
| { |
| "epoch": 1.5768421052631578, |
| "grad_norm": 2.7173637195455473, |
| "learning_rate": 1.0644748311137377e-06, |
| "loss": 0.0663, |
| "step": 749 |
| }, |
| { |
| "epoch": 1.5789473684210527, |
| "grad_norm": 2.5766025285393144, |
| "learning_rate": 1.0542974530180327e-06, |
| "loss": 0.0977, |
| "step": 750 |
| }, |
| { |
| "epoch": 1.5810526315789475, |
| "grad_norm": 2.609993016293197, |
| "learning_rate": 1.0441632244932238e-06, |
| "loss": 0.0884, |
| "step": 751 |
| }, |
| { |
| "epoch": 1.583157894736842, |
| "grad_norm": 3.0209923604689077, |
| "learning_rate": 1.0340722563656109e-06, |
| "loss": 0.0964, |
| "step": 752 |
| }, |
| { |
| "epoch": 1.5852631578947367, |
| "grad_norm": 2.6635239464568286, |
| "learning_rate": 1.0240246589884046e-06, |
| "loss": 0.0657, |
| "step": 753 |
| }, |
| { |
| "epoch": 1.5873684210526315, |
| "grad_norm": 2.987774944183331, |
| "learning_rate": 1.0140205422405213e-06, |
| "loss": 0.0851, |
| "step": 754 |
| }, |
| { |
| "epoch": 1.5894736842105264, |
| "grad_norm": 3.3766295151075365, |
| "learning_rate": 1.0040600155253766e-06, |
| "loss": 0.1112, |
| "step": 755 |
| }, |
| { |
| "epoch": 1.5915789473684212, |
| "grad_norm": 3.0226247449523527, |
| "learning_rate": 9.941431877696955e-07, |
| "loss": 0.0976, |
| "step": 756 |
| }, |
| { |
| "epoch": 1.5936842105263158, |
| "grad_norm": 2.9054545208229405, |
| "learning_rate": 9.842701674223187e-07, |
| "loss": 0.0914, |
| "step": 757 |
| }, |
| { |
| "epoch": 1.5957894736842104, |
| "grad_norm": 2.610578072050956, |
| "learning_rate": 9.744410624530148e-07, |
| "loss": 0.0677, |
| "step": 758 |
| }, |
| { |
| "epoch": 1.5978947368421053, |
| "grad_norm": 3.6096944620452533, |
| "learning_rate": 9.646559803512995e-07, |
| "loss": 0.1039, |
| "step": 759 |
| }, |
| { |
| "epoch": 1.6, |
| "grad_norm": 2.608548813892452, |
| "learning_rate": 9.549150281252633e-07, |
| "loss": 0.0854, |
| "step": 760 |
| }, |
| { |
| "epoch": 1.6021052631578947, |
| "grad_norm": 2.294848325794609, |
| "learning_rate": 9.452183123003999e-07, |
| "loss": 0.0712, |
| "step": 761 |
| }, |
| { |
| "epoch": 1.6042105263157893, |
| "grad_norm": 2.812243878488753, |
| "learning_rate": 9.355659389184396e-07, |
| "loss": 0.1059, |
| "step": 762 |
| }, |
| { |
| "epoch": 1.6063157894736841, |
| "grad_norm": 3.5425866420134806, |
| "learning_rate": 9.259580135361929e-07, |
| "loss": 0.1032, |
| "step": 763 |
| }, |
| { |
| "epoch": 1.608421052631579, |
| "grad_norm": 2.8282691007632574, |
| "learning_rate": 9.163946412243896e-07, |
| "loss": 0.0948, |
| "step": 764 |
| }, |
| { |
| "epoch": 1.6105263157894738, |
| "grad_norm": 2.7856179217728023, |
| "learning_rate": 9.068759265665384e-07, |
| "loss": 0.0806, |
| "step": 765 |
| }, |
| { |
| "epoch": 1.6126315789473684, |
| "grad_norm": 2.7383044343143843, |
| "learning_rate": 8.974019736577777e-07, |
| "loss": 0.097, |
| "step": 766 |
| }, |
| { |
| "epoch": 1.614736842105263, |
| "grad_norm": 2.402534553214678, |
| "learning_rate": 8.879728861037385e-07, |
| "loss": 0.0946, |
| "step": 767 |
| }, |
| { |
| "epoch": 1.6168421052631579, |
| "grad_norm": 2.571133488597209, |
| "learning_rate": 8.785887670194137e-07, |
| "loss": 0.0743, |
| "step": 768 |
| }, |
| { |
| "epoch": 1.6189473684210527, |
| "grad_norm": 2.3849456172715144, |
| "learning_rate": 8.692497190280225e-07, |
| "loss": 0.122, |
| "step": 769 |
| }, |
| { |
| "epoch": 1.6210526315789475, |
| "grad_norm": 2.4256426604960684, |
| "learning_rate": 8.599558442598998e-07, |
| "loss": 0.0983, |
| "step": 770 |
| }, |
| { |
| "epoch": 1.6231578947368421, |
| "grad_norm": 3.1138269135730074, |
| "learning_rate": 8.507072443513703e-07, |
| "loss": 0.0681, |
| "step": 771 |
| }, |
| { |
| "epoch": 1.6252631578947367, |
| "grad_norm": 2.5456622584996205, |
| "learning_rate": 8.415040204436426e-07, |
| "loss": 0.0797, |
| "step": 772 |
| }, |
| { |
| "epoch": 1.6273684210526316, |
| "grad_norm": 2.4765639393665264, |
| "learning_rate": 8.323462731816962e-07, |
| "loss": 0.0808, |
| "step": 773 |
| }, |
| { |
| "epoch": 1.6294736842105264, |
| "grad_norm": 3.0808120883207053, |
| "learning_rate": 8.232341027131885e-07, |
| "loss": 0.1246, |
| "step": 774 |
| }, |
| { |
| "epoch": 1.631578947368421, |
| "grad_norm": 2.1969262047815743, |
| "learning_rate": 8.141676086873574e-07, |
| "loss": 0.0769, |
| "step": 775 |
| }, |
| { |
| "epoch": 1.6336842105263156, |
| "grad_norm": 2.620040982916962, |
| "learning_rate": 8.051468902539272e-07, |
| "loss": 0.0626, |
| "step": 776 |
| }, |
| { |
| "epoch": 1.6357894736842105, |
| "grad_norm": 2.3817308039237943, |
| "learning_rate": 7.961720460620321e-07, |
| "loss": 0.0606, |
| "step": 777 |
| }, |
| { |
| "epoch": 1.6378947368421053, |
| "grad_norm": 2.3280487070318077, |
| "learning_rate": 7.872431742591268e-07, |
| "loss": 0.0766, |
| "step": 778 |
| }, |
| { |
| "epoch": 1.6400000000000001, |
| "grad_norm": 3.073580547488845, |
| "learning_rate": 7.783603724899258e-07, |
| "loss": 0.0877, |
| "step": 779 |
| }, |
| { |
| "epoch": 1.6421052631578947, |
| "grad_norm": 2.9969577749713636, |
| "learning_rate": 7.695237378953224e-07, |
| "loss": 0.1094, |
| "step": 780 |
| }, |
| { |
| "epoch": 1.6442105263157893, |
| "grad_norm": 3.1923236806085726, |
| "learning_rate": 7.607333671113409e-07, |
| "loss": 0.1203, |
| "step": 781 |
| }, |
| { |
| "epoch": 1.6463157894736842, |
| "grad_norm": 3.394844382115183, |
| "learning_rate": 7.519893562680663e-07, |
| "loss": 0.0974, |
| "step": 782 |
| }, |
| { |
| "epoch": 1.648421052631579, |
| "grad_norm": 2.9206691570301317, |
| "learning_rate": 7.432918009885997e-07, |
| "loss": 0.1201, |
| "step": 783 |
| }, |
| { |
| "epoch": 1.6505263157894738, |
| "grad_norm": 2.5882116062754097, |
| "learning_rate": 7.346407963880137e-07, |
| "loss": 0.0782, |
| "step": 784 |
| }, |
| { |
| "epoch": 1.6526315789473685, |
| "grad_norm": 2.999647632411743, |
| "learning_rate": 7.260364370723044e-07, |
| "loss": 0.1032, |
| "step": 785 |
| }, |
| { |
| "epoch": 1.654736842105263, |
| "grad_norm": 2.7057840276614527, |
| "learning_rate": 7.174788171373731e-07, |
| "loss": 0.1068, |
| "step": 786 |
| }, |
| { |
| "epoch": 1.656842105263158, |
| "grad_norm": 3.1605976272604797, |
| "learning_rate": 7.089680301679752e-07, |
| "loss": 0.086, |
| "step": 787 |
| }, |
| { |
| "epoch": 1.6589473684210527, |
| "grad_norm": 2.0265271144542085, |
| "learning_rate": 7.005041692367154e-07, |
| "loss": 0.0654, |
| "step": 788 |
| }, |
| { |
| "epoch": 1.6610526315789473, |
| "grad_norm": 3.0449660028759076, |
| "learning_rate": 6.92087326903022e-07, |
| "loss": 0.1064, |
| "step": 789 |
| }, |
| { |
| "epoch": 1.663157894736842, |
| "grad_norm": 2.846318524701942, |
| "learning_rate": 6.837175952121305e-07, |
| "loss": 0.1143, |
| "step": 790 |
| }, |
| { |
| "epoch": 1.6652631578947368, |
| "grad_norm": 2.2451563492479387, |
| "learning_rate": 6.753950656940905e-07, |
| "loss": 0.0687, |
| "step": 791 |
| }, |
| { |
| "epoch": 1.6673684210526316, |
| "grad_norm": 2.6443002936675675, |
| "learning_rate": 6.671198293627479e-07, |
| "loss": 0.0663, |
| "step": 792 |
| }, |
| { |
| "epoch": 1.6694736842105264, |
| "grad_norm": 3.990455910934455, |
| "learning_rate": 6.58891976714764e-07, |
| "loss": 0.1302, |
| "step": 793 |
| }, |
| { |
| "epoch": 1.671578947368421, |
| "grad_norm": 2.7128378446081336, |
| "learning_rate": 6.507115977286144e-07, |
| "loss": 0.0681, |
| "step": 794 |
| }, |
| { |
| "epoch": 1.6736842105263157, |
| "grad_norm": 2.883050359735446, |
| "learning_rate": 6.425787818636131e-07, |
| "loss": 0.0864, |
| "step": 795 |
| }, |
| { |
| "epoch": 1.6757894736842105, |
| "grad_norm": 2.6456133797383674, |
| "learning_rate": 6.34493618058935e-07, |
| "loss": 0.0587, |
| "step": 796 |
| }, |
| { |
| "epoch": 1.6778947368421053, |
| "grad_norm": 3.139020707158627, |
| "learning_rate": 6.264561947326331e-07, |
| "loss": 0.0733, |
| "step": 797 |
| }, |
| { |
| "epoch": 1.6800000000000002, |
| "grad_norm": 3.4935886643877536, |
| "learning_rate": 6.184665997806832e-07, |
| "loss": 0.1061, |
| "step": 798 |
| }, |
| { |
| "epoch": 1.6821052631578948, |
| "grad_norm": 3.340770162416328, |
| "learning_rate": 6.105249205760128e-07, |
| "loss": 0.1071, |
| "step": 799 |
| }, |
| { |
| "epoch": 1.6842105263157894, |
| "grad_norm": 2.8315166821387794, |
| "learning_rate": 6.026312439675553e-07, |
| "loss": 0.0933, |
| "step": 800 |
| }, |
| { |
| "epoch": 1.6842105263157894, |
| "eval_loss": 0.23096558451652527, |
| "eval_runtime": 0.9217, |
| "eval_samples_per_second": 42.312, |
| "eval_steps_per_second": 10.849, |
| "step": 800 |
| }, |
| { |
| "epoch": 1.6863157894736842, |
| "grad_norm": 3.8068745353327795, |
| "learning_rate": 5.947856562792926e-07, |
| "loss": 0.0982, |
| "step": 801 |
| }, |
| { |
| "epoch": 1.688421052631579, |
| "grad_norm": 3.143938757336479, |
| "learning_rate": 5.869882433093154e-07, |
| "loss": 0.1176, |
| "step": 802 |
| }, |
| { |
| "epoch": 1.6905263157894737, |
| "grad_norm": 3.1475689679942365, |
| "learning_rate": 5.79239090328883e-07, |
| "loss": 0.084, |
| "step": 803 |
| }, |
| { |
| "epoch": 1.6926315789473683, |
| "grad_norm": 2.8459510622938557, |
| "learning_rate": 5.715382820814885e-07, |
| "loss": 0.0924, |
| "step": 804 |
| }, |
| { |
| "epoch": 1.694736842105263, |
| "grad_norm": 2.7457521722799774, |
| "learning_rate": 5.63885902781941e-07, |
| "loss": 0.1167, |
| "step": 805 |
| }, |
| { |
| "epoch": 1.696842105263158, |
| "grad_norm": 2.642505854996809, |
| "learning_rate": 5.562820361154315e-07, |
| "loss": 0.0883, |
| "step": 806 |
| }, |
| { |
| "epoch": 1.6989473684210528, |
| "grad_norm": 2.6453141131731805, |
| "learning_rate": 5.487267652366291e-07, |
| "loss": 0.1037, |
| "step": 807 |
| }, |
| { |
| "epoch": 1.7010526315789474, |
| "grad_norm": 2.9021664082276577, |
| "learning_rate": 5.412201727687644e-07, |
| "loss": 0.0928, |
| "step": 808 |
| }, |
| { |
| "epoch": 1.703157894736842, |
| "grad_norm": 2.845026463465637, |
| "learning_rate": 5.337623408027293e-07, |
| "loss": 0.0782, |
| "step": 809 |
| }, |
| { |
| "epoch": 1.7052631578947368, |
| "grad_norm": 2.7790818083094355, |
| "learning_rate": 5.263533508961827e-07, |
| "loss": 0.1048, |
| "step": 810 |
| }, |
| { |
| "epoch": 1.7073684210526316, |
| "grad_norm": 3.231780821125863, |
| "learning_rate": 5.189932840726486e-07, |
| "loss": 0.0993, |
| "step": 811 |
| }, |
| { |
| "epoch": 1.7094736842105265, |
| "grad_norm": 2.76959876603585, |
| "learning_rate": 5.116822208206396e-07, |
| "loss": 0.0762, |
| "step": 812 |
| }, |
| { |
| "epoch": 1.711578947368421, |
| "grad_norm": 2.931427414212537, |
| "learning_rate": 5.044202410927707e-07, |
| "loss": 0.1107, |
| "step": 813 |
| }, |
| { |
| "epoch": 1.7136842105263157, |
| "grad_norm": 3.464304450508926, |
| "learning_rate": 4.972074243048896e-07, |
| "loss": 0.1182, |
| "step": 814 |
| }, |
| { |
| "epoch": 1.7157894736842105, |
| "grad_norm": 3.0622245546273468, |
| "learning_rate": 4.900438493352056e-07, |
| "loss": 0.1385, |
| "step": 815 |
| }, |
| { |
| "epoch": 1.7178947368421054, |
| "grad_norm": 2.6699450462801257, |
| "learning_rate": 4.829295945234258e-07, |
| "loss": 0.0927, |
| "step": 816 |
| }, |
| { |
| "epoch": 1.72, |
| "grad_norm": 3.50687183202409, |
| "learning_rate": 4.758647376699033e-07, |
| "loss": 0.0874, |
| "step": 817 |
| }, |
| { |
| "epoch": 1.7221052631578946, |
| "grad_norm": 2.4422273374424885, |
| "learning_rate": 4.6884935603477733e-07, |
| "loss": 0.0761, |
| "step": 818 |
| }, |
| { |
| "epoch": 1.7242105263157894, |
| "grad_norm": 3.278144620455897, |
| "learning_rate": 4.6188352633713964e-07, |
| "loss": 0.0836, |
| "step": 819 |
| }, |
| { |
| "epoch": 1.7263157894736842, |
| "grad_norm": 3.3968120192443365, |
| "learning_rate": 4.549673247541875e-07, |
| "loss": 0.0841, |
| "step": 820 |
| }, |
| { |
| "epoch": 1.728421052631579, |
| "grad_norm": 2.8655594086772664, |
| "learning_rate": 4.48100826920394e-07, |
| "loss": 0.0942, |
| "step": 821 |
| }, |
| { |
| "epoch": 1.7305263157894737, |
| "grad_norm": 3.3736122986235455, |
| "learning_rate": 4.412841079266778e-07, |
| "loss": 0.079, |
| "step": 822 |
| }, |
| { |
| "epoch": 1.7326315789473683, |
| "grad_norm": 2.80681634104848, |
| "learning_rate": 4.345172423195865e-07, |
| "loss": 0.078, |
| "step": 823 |
| }, |
| { |
| "epoch": 1.7347368421052631, |
| "grad_norm": 3.017731325212494, |
| "learning_rate": 4.27800304100478e-07, |
| "loss": 0.1121, |
| "step": 824 |
| }, |
| { |
| "epoch": 1.736842105263158, |
| "grad_norm": 2.1538534908319016, |
| "learning_rate": 4.211333667247125e-07, |
| "loss": 0.0624, |
| "step": 825 |
| }, |
| { |
| "epoch": 1.7389473684210528, |
| "grad_norm": 3.051347701639027, |
| "learning_rate": 4.1451650310085076e-07, |
| "loss": 0.1013, |
| "step": 826 |
| }, |
| { |
| "epoch": 1.7410526315789474, |
| "grad_norm": 2.878022799209242, |
| "learning_rate": 4.079497855898501e-07, |
| "loss": 0.0813, |
| "step": 827 |
| }, |
| { |
| "epoch": 1.743157894736842, |
| "grad_norm": 2.7337628143067145, |
| "learning_rate": 4.01433286004283e-07, |
| "loss": 0.0888, |
| "step": 828 |
| }, |
| { |
| "epoch": 1.7452631578947368, |
| "grad_norm": 3.3634783248963607, |
| "learning_rate": 3.949670756075447e-07, |
| "loss": 0.1221, |
| "step": 829 |
| }, |
| { |
| "epoch": 1.7473684210526317, |
| "grad_norm": 2.4140802029160118, |
| "learning_rate": 3.885512251130763e-07, |
| "loss": 0.078, |
| "step": 830 |
| }, |
| { |
| "epoch": 1.7494736842105263, |
| "grad_norm": 2.4747610892496494, |
| "learning_rate": 3.8218580468359136e-07, |
| "loss": 0.0878, |
| "step": 831 |
| }, |
| { |
| "epoch": 1.751578947368421, |
| "grad_norm": 2.8846811138167348, |
| "learning_rate": 3.7587088393030604e-07, |
| "loss": 0.1013, |
| "step": 832 |
| }, |
| { |
| "epoch": 1.7536842105263157, |
| "grad_norm": 3.2969467838779787, |
| "learning_rate": 3.6960653191218333e-07, |
| "loss": 0.1004, |
| "step": 833 |
| }, |
| { |
| "epoch": 1.7557894736842106, |
| "grad_norm": 2.876697595239593, |
| "learning_rate": 3.6339281713517304e-07, |
| "loss": 0.0822, |
| "step": 834 |
| }, |
| { |
| "epoch": 1.7578947368421054, |
| "grad_norm": 2.600705895879165, |
| "learning_rate": 3.572298075514652e-07, |
| "loss": 0.0929, |
| "step": 835 |
| }, |
| { |
| "epoch": 1.76, |
| "grad_norm": 4.100244906164762, |
| "learning_rate": 3.511175705587433e-07, |
| "loss": 0.0931, |
| "step": 836 |
| }, |
| { |
| "epoch": 1.7621052631578946, |
| "grad_norm": 2.776561934122947, |
| "learning_rate": 3.450561729994534e-07, |
| "loss": 0.0749, |
| "step": 837 |
| }, |
| { |
| "epoch": 1.7642105263157895, |
| "grad_norm": 2.8067264258839106, |
| "learning_rate": 3.390456811600673e-07, |
| "loss": 0.1108, |
| "step": 838 |
| }, |
| { |
| "epoch": 1.7663157894736843, |
| "grad_norm": 2.568562849379649, |
| "learning_rate": 3.3308616077036113e-07, |
| "loss": 0.085, |
| "step": 839 |
| }, |
| { |
| "epoch": 1.768421052631579, |
| "grad_norm": 3.1802847449938105, |
| "learning_rate": 3.271776770026963e-07, |
| "loss": 0.1122, |
| "step": 840 |
| }, |
| { |
| "epoch": 1.7705263157894737, |
| "grad_norm": 3.4764158023876472, |
| "learning_rate": 3.213202944713023e-07, |
| "loss": 0.1063, |
| "step": 841 |
| }, |
| { |
| "epoch": 1.7726315789473683, |
| "grad_norm": 3.390798613362417, |
| "learning_rate": 3.1551407723157734e-07, |
| "loss": 0.1008, |
| "step": 842 |
| }, |
| { |
| "epoch": 1.7747368421052632, |
| "grad_norm": 3.038218122247456, |
| "learning_rate": 3.0975908877938277e-07, |
| "loss": 0.1125, |
| "step": 843 |
| }, |
| { |
| "epoch": 1.776842105263158, |
| "grad_norm": 3.3457689829793487, |
| "learning_rate": 3.040553920503503e-07, |
| "loss": 0.1072, |
| "step": 844 |
| }, |
| { |
| "epoch": 1.7789473684210526, |
| "grad_norm": 3.2377107894870925, |
| "learning_rate": 2.984030494191942e-07, |
| "loss": 0.0912, |
| "step": 845 |
| }, |
| { |
| "epoch": 1.7810526315789472, |
| "grad_norm": 2.750908478454545, |
| "learning_rate": 2.928021226990263e-07, |
| "loss": 0.0792, |
| "step": 846 |
| }, |
| { |
| "epoch": 1.783157894736842, |
| "grad_norm": 2.400795013700124, |
| "learning_rate": 2.8725267314068496e-07, |
| "loss": 0.0762, |
| "step": 847 |
| }, |
| { |
| "epoch": 1.7852631578947369, |
| "grad_norm": 3.110512202087771, |
| "learning_rate": 2.817547614320615e-07, |
| "loss": 0.0764, |
| "step": 848 |
| }, |
| { |
| "epoch": 1.7873684210526317, |
| "grad_norm": 2.5090668483357805, |
| "learning_rate": 2.763084476974376e-07, |
| "loss": 0.1128, |
| "step": 849 |
| }, |
| { |
| "epoch": 1.7894736842105263, |
| "grad_norm": 2.2736980741070543, |
| "learning_rate": 2.7091379149682683e-07, |
| "loss": 0.0636, |
| "step": 850 |
| }, |
| { |
| "epoch": 1.791578947368421, |
| "grad_norm": 2.8650745187192848, |
| "learning_rate": 2.655708518253258e-07, |
| "loss": 0.0841, |
| "step": 851 |
| }, |
| { |
| "epoch": 1.7936842105263158, |
| "grad_norm": 2.81864315109065, |
| "learning_rate": 2.602796871124663e-07, |
| "loss": 0.091, |
| "step": 852 |
| }, |
| { |
| "epoch": 1.7957894736842106, |
| "grad_norm": 2.749063211163363, |
| "learning_rate": 2.5504035522157853e-07, |
| "loss": 0.0943, |
| "step": 853 |
| }, |
| { |
| "epoch": 1.7978947368421052, |
| "grad_norm": 3.0229430850580945, |
| "learning_rate": 2.4985291344915675e-07, |
| "loss": 0.1269, |
| "step": 854 |
| }, |
| { |
| "epoch": 1.8, |
| "grad_norm": 2.9239017619537053, |
| "learning_rate": 2.447174185242324e-07, |
| "loss": 0.0911, |
| "step": 855 |
| }, |
| { |
| "epoch": 1.8021052631578947, |
| "grad_norm": 2.616988483077732, |
| "learning_rate": 2.3963392660775576e-07, |
| "loss": 0.083, |
| "step": 856 |
| }, |
| { |
| "epoch": 1.8042105263157895, |
| "grad_norm": 3.2018633874558917, |
| "learning_rate": 2.3460249329197825e-07, |
| "loss": 0.0969, |
| "step": 857 |
| }, |
| { |
| "epoch": 1.8063157894736843, |
| "grad_norm": 3.1492659688194813, |
| "learning_rate": 2.296231735998511e-07, |
| "loss": 0.0892, |
| "step": 858 |
| }, |
| { |
| "epoch": 1.808421052631579, |
| "grad_norm": 3.374868955965745, |
| "learning_rate": 2.2469602198441575e-07, |
| "loss": 0.1232, |
| "step": 859 |
| }, |
| { |
| "epoch": 1.8105263157894735, |
| "grad_norm": 2.910000563908748, |
| "learning_rate": 2.198210923282118e-07, |
| "loss": 0.1016, |
| "step": 860 |
| }, |
| { |
| "epoch": 1.8126315789473684, |
| "grad_norm": 2.7882770708484728, |
| "learning_rate": 2.149984379426906e-07, |
| "loss": 0.0752, |
| "step": 861 |
| }, |
| { |
| "epoch": 1.8147368421052632, |
| "grad_norm": 3.1470017852346266, |
| "learning_rate": 2.102281115676258e-07, |
| "loss": 0.102, |
| "step": 862 |
| }, |
| { |
| "epoch": 1.816842105263158, |
| "grad_norm": 3.2300809492562004, |
| "learning_rate": 2.0551016537054492e-07, |
| "loss": 0.1049, |
| "step": 863 |
| }, |
| { |
| "epoch": 1.8189473684210526, |
| "grad_norm": 2.8796612295432484, |
| "learning_rate": 2.008446509461498e-07, |
| "loss": 0.0808, |
| "step": 864 |
| }, |
| { |
| "epoch": 1.8210526315789473, |
| "grad_norm": 2.759597506021318, |
| "learning_rate": 1.962316193157593e-07, |
| "loss": 0.0695, |
| "step": 865 |
| }, |
| { |
| "epoch": 1.823157894736842, |
| "grad_norm": 3.442533845727977, |
| "learning_rate": 1.91671120926748e-07, |
| "loss": 0.0753, |
| "step": 866 |
| }, |
| { |
| "epoch": 1.825263157894737, |
| "grad_norm": 2.966239208218966, |
| "learning_rate": 1.871632056519962e-07, |
| "loss": 0.1021, |
| "step": 867 |
| }, |
| { |
| "epoch": 1.8273684210526315, |
| "grad_norm": 2.762544594000791, |
| "learning_rate": 1.8270792278934302e-07, |
| "loss": 0.0986, |
| "step": 868 |
| }, |
| { |
| "epoch": 1.8294736842105264, |
| "grad_norm": 3.4023336405774054, |
| "learning_rate": 1.7830532106104747e-07, |
| "loss": 0.0895, |
| "step": 869 |
| }, |
| { |
| "epoch": 1.831578947368421, |
| "grad_norm": 2.7309690429581677, |
| "learning_rate": 1.7395544861325718e-07, |
| "loss": 0.0782, |
| "step": 870 |
| }, |
| { |
| "epoch": 1.8336842105263158, |
| "grad_norm": 2.8560692270110053, |
| "learning_rate": 1.696583530154794e-07, |
| "loss": 0.0902, |
| "step": 871 |
| }, |
| { |
| "epoch": 1.8357894736842106, |
| "grad_norm": 3.0786659593547476, |
| "learning_rate": 1.6541408126006464e-07, |
| "loss": 0.1019, |
| "step": 872 |
| }, |
| { |
| "epoch": 1.8378947368421052, |
| "grad_norm": 2.714427085102136, |
| "learning_rate": 1.6122267976168783e-07, |
| "loss": 0.1038, |
| "step": 873 |
| }, |
| { |
| "epoch": 1.8399999999999999, |
| "grad_norm": 4.060810899220369, |
| "learning_rate": 1.5708419435684463e-07, |
| "loss": 0.0954, |
| "step": 874 |
| }, |
| { |
| "epoch": 1.8421052631578947, |
| "grad_norm": 2.9436675014639704, |
| "learning_rate": 1.5299867030334815e-07, |
| "loss": 0.0716, |
| "step": 875 |
| }, |
| { |
| "epoch": 1.8442105263157895, |
| "grad_norm": 2.5848552590925067, |
| "learning_rate": 1.4896615227983468e-07, |
| "loss": 0.0553, |
| "step": 876 |
| }, |
| { |
| "epoch": 1.8463157894736844, |
| "grad_norm": 2.7187088217026503, |
| "learning_rate": 1.4498668438527597e-07, |
| "loss": 0.0699, |
| "step": 877 |
| }, |
| { |
| "epoch": 1.848421052631579, |
| "grad_norm": 2.6374619830581283, |
| "learning_rate": 1.4106031013849498e-07, |
| "loss": 0.0715, |
| "step": 878 |
| }, |
| { |
| "epoch": 1.8505263157894736, |
| "grad_norm": 2.7359260474957736, |
| "learning_rate": 1.3718707247769137e-07, |
| "loss": 0.082, |
| "step": 879 |
| }, |
| { |
| "epoch": 1.8526315789473684, |
| "grad_norm": 2.8874455937698236, |
| "learning_rate": 1.333670137599713e-07, |
| "loss": 0.113, |
| "step": 880 |
| }, |
| { |
| "epoch": 1.8547368421052632, |
| "grad_norm": 2.3984320677740256, |
| "learning_rate": 1.2960017576088445e-07, |
| "loss": 0.0662, |
| "step": 881 |
| }, |
| { |
| "epoch": 1.8568421052631578, |
| "grad_norm": 2.631557349703051, |
| "learning_rate": 1.2588659967396998e-07, |
| "loss": 0.0591, |
| "step": 882 |
| }, |
| { |
| "epoch": 1.8589473684210527, |
| "grad_norm": 2.8554369642818824, |
| "learning_rate": 1.222263261102985e-07, |
| "loss": 0.0863, |
| "step": 883 |
| }, |
| { |
| "epoch": 1.8610526315789473, |
| "grad_norm": 3.824376982589371, |
| "learning_rate": 1.1861939509803688e-07, |
| "loss": 0.0965, |
| "step": 884 |
| }, |
| { |
| "epoch": 1.8631578947368421, |
| "grad_norm": 2.559431945156892, |
| "learning_rate": 1.1506584608200366e-07, |
| "loss": 0.0955, |
| "step": 885 |
| }, |
| { |
| "epoch": 1.865263157894737, |
| "grad_norm": 2.3957520209090055, |
| "learning_rate": 1.1156571792324212e-07, |
| "loss": 0.0595, |
| "step": 886 |
| }, |
| { |
| "epoch": 1.8673684210526316, |
| "grad_norm": 2.3363953763581704, |
| "learning_rate": 1.0811904889859337e-07, |
| "loss": 0.0745, |
| "step": 887 |
| }, |
| { |
| "epoch": 1.8694736842105262, |
| "grad_norm": 3.0084512102418355, |
| "learning_rate": 1.0472587670027678e-07, |
| "loss": 0.1011, |
| "step": 888 |
| }, |
| { |
| "epoch": 1.871578947368421, |
| "grad_norm": 2.7795110502364047, |
| "learning_rate": 1.0138623843548078e-07, |
| "loss": 0.0806, |
| "step": 889 |
| }, |
| { |
| "epoch": 1.8736842105263158, |
| "grad_norm": 2.1652550097362484, |
| "learning_rate": 9.810017062595322e-08, |
| "loss": 0.0608, |
| "step": 890 |
| }, |
| { |
| "epoch": 1.8757894736842107, |
| "grad_norm": 2.6793941600419444, |
| "learning_rate": 9.486770920760668e-08, |
| "loss": 0.0806, |
| "step": 891 |
| }, |
| { |
| "epoch": 1.8778947368421053, |
| "grad_norm": 3.2201122598929364, |
| "learning_rate": 9.16888895301199e-08, |
| "loss": 0.1135, |
| "step": 892 |
| }, |
| { |
| "epoch": 1.88, |
| "grad_norm": 2.566921068911272, |
| "learning_rate": 8.856374635655696e-08, |
| "loss": 0.0688, |
| "step": 893 |
| }, |
| { |
| "epoch": 1.8821052631578947, |
| "grad_norm": 3.1079888316462565, |
| "learning_rate": 8.549231386298151e-08, |
| "loss": 0.0997, |
| "step": 894 |
| }, |
| { |
| "epoch": 1.8842105263157896, |
| "grad_norm": 2.7999506483166305, |
| "learning_rate": 8.247462563808816e-08, |
| "loss": 0.0946, |
| "step": 895 |
| }, |
| { |
| "epoch": 1.8863157894736842, |
| "grad_norm": 3.367662443206227, |
| "learning_rate": 7.951071468283166e-08, |
| "loss": 0.0998, |
| "step": 896 |
| }, |
| { |
| "epoch": 1.888421052631579, |
| "grad_norm": 4.550576913048775, |
| "learning_rate": 7.660061341006719e-08, |
| "loss": 0.1035, |
| "step": 897 |
| }, |
| { |
| "epoch": 1.8905263157894736, |
| "grad_norm": 3.2490454117201626, |
| "learning_rate": 7.374435364419675e-08, |
| "loss": 0.1205, |
| "step": 898 |
| }, |
| { |
| "epoch": 1.8926315789473684, |
| "grad_norm": 3.011502455562598, |
| "learning_rate": 7.094196662081832e-08, |
| "loss": 0.1008, |
| "step": 899 |
| }, |
| { |
| "epoch": 1.8947368421052633, |
| "grad_norm": 2.974407865337836, |
| "learning_rate": 6.819348298638839e-08, |
| "loss": 0.0779, |
| "step": 900 |
| }, |
| { |
| "epoch": 1.8968421052631579, |
| "grad_norm": 2.895980879467302, |
| "learning_rate": 6.549893279788278e-08, |
| "loss": 0.0805, |
| "step": 901 |
| }, |
| { |
| "epoch": 1.8989473684210525, |
| "grad_norm": 2.8922313610110923, |
| "learning_rate": 6.285834552247127e-08, |
| "loss": 0.1067, |
| "step": 902 |
| }, |
| { |
| "epoch": 1.9010526315789473, |
| "grad_norm": 3.353278867338892, |
| "learning_rate": 6.027175003719354e-08, |
| "loss": 0.0958, |
| "step": 903 |
| }, |
| { |
| "epoch": 1.9031578947368422, |
| "grad_norm": 2.650567655819524, |
| "learning_rate": 5.773917462864265e-08, |
| "loss": 0.0702, |
| "step": 904 |
| }, |
| { |
| "epoch": 1.905263157894737, |
| "grad_norm": 2.6264185528158834, |
| "learning_rate": 5.526064699265754e-08, |
| "loss": 0.0912, |
| "step": 905 |
| }, |
| { |
| "epoch": 1.9073684210526316, |
| "grad_norm": 2.527286884707186, |
| "learning_rate": 5.2836194234019976e-08, |
| "loss": 0.0771, |
| "step": 906 |
| }, |
| { |
| "epoch": 1.9094736842105262, |
| "grad_norm": 2.612912180177236, |
| "learning_rate": 5.0465842866156965e-08, |
| "loss": 0.0666, |
| "step": 907 |
| }, |
| { |
| "epoch": 1.911578947368421, |
| "grad_norm": 2.8431243287193877, |
| "learning_rate": 4.8149618810850454e-08, |
| "loss": 0.0802, |
| "step": 908 |
| }, |
| { |
| "epoch": 1.9136842105263159, |
| "grad_norm": 2.5645766842154276, |
| "learning_rate": 4.588754739795587e-08, |
| "loss": 0.0793, |
| "step": 909 |
| }, |
| { |
| "epoch": 1.9157894736842105, |
| "grad_norm": 3.545293325505326, |
| "learning_rate": 4.367965336512403e-08, |
| "loss": 0.0987, |
| "step": 910 |
| }, |
| { |
| "epoch": 1.917894736842105, |
| "grad_norm": 3.0722983206898045, |
| "learning_rate": 4.1525960857530244e-08, |
| "loss": 0.0804, |
| "step": 911 |
| }, |
| { |
| "epoch": 1.92, |
| "grad_norm": 2.079813956741024, |
| "learning_rate": 3.9426493427611177e-08, |
| "loss": 0.0681, |
| "step": 912 |
| }, |
| { |
| "epoch": 1.9221052631578948, |
| "grad_norm": 3.574578189030846, |
| "learning_rate": 3.738127403480507e-08, |
| "loss": 0.1216, |
| "step": 913 |
| }, |
| { |
| "epoch": 1.9242105263157896, |
| "grad_norm": 2.575539056464793, |
| "learning_rate": 3.5390325045304704e-08, |
| "loss": 0.0517, |
| "step": 914 |
| }, |
| { |
| "epoch": 1.9263157894736842, |
| "grad_norm": 3.0048897811038255, |
| "learning_rate": 3.345366823180929e-08, |
| "loss": 0.0931, |
| "step": 915 |
| }, |
| { |
| "epoch": 1.9284210526315788, |
| "grad_norm": 3.046405391039599, |
| "learning_rate": 3.1571324773286284e-08, |
| "loss": 0.0969, |
| "step": 916 |
| }, |
| { |
| "epoch": 1.9305263157894736, |
| "grad_norm": 3.5520843745316872, |
| "learning_rate": 2.9743315254743834e-08, |
| "loss": 0.0782, |
| "step": 917 |
| }, |
| { |
| "epoch": 1.9326315789473685, |
| "grad_norm": 2.7132714135482208, |
| "learning_rate": 2.7969659666999273e-08, |
| "loss": 0.1037, |
| "step": 918 |
| }, |
| { |
| "epoch": 1.9347368421052633, |
| "grad_norm": 3.123212441352547, |
| "learning_rate": 2.625037740646763e-08, |
| "loss": 0.0882, |
| "step": 919 |
| }, |
| { |
| "epoch": 1.936842105263158, |
| "grad_norm": 3.646324477345249, |
| "learning_rate": 2.4585487274942922e-08, |
| "loss": 0.1263, |
| "step": 920 |
| }, |
| { |
| "epoch": 1.9389473684210525, |
| "grad_norm": 2.5318288248240397, |
| "learning_rate": 2.2975007479397736e-08, |
| "loss": 0.0716, |
| "step": 921 |
| }, |
| { |
| "epoch": 1.9410526315789474, |
| "grad_norm": 3.2685591223413817, |
| "learning_rate": 2.1418955631781203e-08, |
| "loss": 0.1168, |
| "step": 922 |
| }, |
| { |
| "epoch": 1.9431578947368422, |
| "grad_norm": 3.2425891311790136, |
| "learning_rate": 1.9917348748826337e-08, |
| "loss": 0.1161, |
| "step": 923 |
| }, |
| { |
| "epoch": 1.9452631578947368, |
| "grad_norm": 3.4814985891690884, |
| "learning_rate": 1.847020325186577e-08, |
| "loss": 0.0773, |
| "step": 924 |
| }, |
| { |
| "epoch": 1.9473684210526314, |
| "grad_norm": 2.9962557150283393, |
| "learning_rate": 1.7077534966650767e-08, |
| "loss": 0.1099, |
| "step": 925 |
| }, |
| { |
| "epoch": 1.9494736842105262, |
| "grad_norm": 3.1838150741664784, |
| "learning_rate": 1.5739359123178587e-08, |
| "loss": 0.1119, |
| "step": 926 |
| }, |
| { |
| "epoch": 1.951578947368421, |
| "grad_norm": 2.9531176654422855, |
| "learning_rate": 1.4455690355525964e-08, |
| "loss": 0.0849, |
| "step": 927 |
| }, |
| { |
| "epoch": 1.953684210526316, |
| "grad_norm": 3.237219758595749, |
| "learning_rate": 1.3226542701689215e-08, |
| "loss": 0.0956, |
| "step": 928 |
| }, |
| { |
| "epoch": 1.9557894736842105, |
| "grad_norm": 2.6636107881636657, |
| "learning_rate": 1.2051929603428824e-08, |
| "loss": 0.0833, |
| "step": 929 |
| }, |
| { |
| "epoch": 1.9578947368421051, |
| "grad_norm": 3.0753931013806675, |
| "learning_rate": 1.0931863906127327e-08, |
| "loss": 0.096, |
| "step": 930 |
| }, |
| { |
| "epoch": 1.96, |
| "grad_norm": 3.395827147776403, |
| "learning_rate": 9.866357858642206e-09, |
| "loss": 0.0886, |
| "step": 931 |
| }, |
| { |
| "epoch": 1.9621052631578948, |
| "grad_norm": 2.4050339156958307, |
| "learning_rate": 8.855423113177664e-09, |
| "loss": 0.0751, |
| "step": 932 |
| }, |
| { |
| "epoch": 1.9642105263157896, |
| "grad_norm": 2.6642559497093905, |
| "learning_rate": 7.899070725153612e-09, |
| "loss": 0.0684, |
| "step": 933 |
| }, |
| { |
| "epoch": 1.9663157894736842, |
| "grad_norm": 3.73805463863678, |
| "learning_rate": 6.997311153086883e-09, |
| "loss": 0.0979, |
| "step": 934 |
| }, |
| { |
| "epoch": 1.9684210526315788, |
| "grad_norm": 2.980994237992295, |
| "learning_rate": 6.150154258476315e-09, |
| "loss": 0.0797, |
| "step": 935 |
| }, |
| { |
| "epoch": 1.9705263157894737, |
| "grad_norm": 3.4371787351109333, |
| "learning_rate": 5.357609305692291e-09, |
| "loss": 0.103, |
| "step": 936 |
| }, |
| { |
| "epoch": 1.9726315789473685, |
| "grad_norm": 2.6478564332335193, |
| "learning_rate": 4.619684961881255e-09, |
| "loss": 0.0771, |
| "step": 937 |
| }, |
| { |
| "epoch": 1.9747368421052631, |
| "grad_norm": 2.4729221890510167, |
| "learning_rate": 3.936389296864129e-09, |
| "loss": 0.0667, |
| "step": 938 |
| }, |
| { |
| "epoch": 1.9768421052631577, |
| "grad_norm": 2.9102631943004686, |
| "learning_rate": 3.307729783054159e-09, |
| "loss": 0.1091, |
| "step": 939 |
| }, |
| { |
| "epoch": 1.9789473684210526, |
| "grad_norm": 2.655897353757651, |
| "learning_rate": 2.7337132953697555e-09, |
| "loss": 0.058, |
| "step": 940 |
| }, |
| { |
| "epoch": 1.9810526315789474, |
| "grad_norm": 2.971572950538632, |
| "learning_rate": 2.214346111164556e-09, |
| "loss": 0.1048, |
| "step": 941 |
| }, |
| { |
| "epoch": 1.9831578947368422, |
| "grad_norm": 2.889850391546724, |
| "learning_rate": 1.749633910153592e-09, |
| "loss": 0.087, |
| "step": 942 |
| }, |
| { |
| "epoch": 1.9852631578947368, |
| "grad_norm": 2.9975744327597327, |
| "learning_rate": 1.3395817743561135e-09, |
| "loss": 0.0898, |
| "step": 943 |
| }, |
| { |
| "epoch": 1.9873684210526315, |
| "grad_norm": 2.754815439262134, |
| "learning_rate": 9.841941880361917e-10, |
| "loss": 0.0916, |
| "step": 944 |
| }, |
| { |
| "epoch": 1.9894736842105263, |
| "grad_norm": 3.2077861868156314, |
| "learning_rate": 6.834750376549793e-10, |
| "loss": 0.1003, |
| "step": 945 |
| }, |
| { |
| "epoch": 1.9915789473684211, |
| "grad_norm": 2.908656770112915, |
| "learning_rate": 4.374276118301879e-10, |
| "loss": 0.0828, |
| "step": 946 |
| }, |
| { |
| "epoch": 1.993684210526316, |
| "grad_norm": 2.933260030488579, |
| "learning_rate": 2.4605460129556446e-10, |
| "loss": 0.09, |
| "step": 947 |
| }, |
| { |
| "epoch": 1.9957894736842106, |
| "grad_norm": 2.4387802348581027, |
| "learning_rate": 1.0935809887702154e-10, |
| "loss": 0.0727, |
| "step": 948 |
| }, |
| { |
| "epoch": 1.9978947368421052, |
| "grad_norm": 2.543159609703416, |
| "learning_rate": 2.733959946432663e-11, |
| "loss": 0.0934, |
| "step": 949 |
| }, |
| { |
| "epoch": 2.0, |
| "grad_norm": 2.6404824543206735, |
| "learning_rate": 0.0, |
| "loss": 0.0841, |
| "step": 950 |
| }, |
| { |
| "epoch": 2.0, |
| "step": 950, |
| "total_flos": 1782751297536.0, |
| "train_loss": 0.18172799837824546, |
| "train_runtime": 438.8142, |
| "train_samples_per_second": 17.315, |
| "train_steps_per_second": 2.165 |
| } |
| ], |
| "logging_steps": 1, |
| "max_steps": 950, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 2, |
| "save_steps": 5000, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 1782751297536.0, |
| "train_batch_size": 2, |
| "trial_name": null, |
| "trial_params": null |
| } |
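The log above is a complete `trainer_state.json` as written by the Hugging Face `transformers` `Trainer`: one `log_history` entry per optimizer step (loss, gradient norm, learning rate), interleaved `eval_loss` entries, and a final run summary. A minimal sketch of how it might be inspected, assuming the file is saved locally as `trainer_state.json` and that `matplotlib` is installed (both are assumptions, not part of the log itself):

```python
# Minimal sketch: load this trainer_state.json and plot the logged
# training loss and learning-rate schedule. The file path and the
# matplotlib dependency are assumptions, not part of the log.
import json

import matplotlib.pyplot as plt

with open("trainer_state.json") as f:
    state = json.load(f)

# Keep only per-step training entries: eval entries carry "eval_loss"
# instead of "loss", and the final summary entry has "train_loss".
train_logs = [e for e in state["log_history"] if "loss" in e]

steps = [e["step"] for e in train_logs]
losses = [e["loss"] for e in train_logs]
lrs = [e["learning_rate"] for e in train_logs]

fig, (ax_loss, ax_lr) = plt.subplots(2, 1, sharex=True)
ax_loss.plot(steps, losses)
ax_loss.set_ylabel("train loss")
ax_lr.plot(steps, lrs)
ax_lr.set_ylabel("learning rate")
ax_lr.set_xlabel("step")
plt.show()
```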